
下面是内存溢出 jb51.cc 通过网络收集整理的代码片段。
内存溢出小编现在分享给大家,也给大家做个参考。
# Coding=UTF-8import nltkfrom nltk.corpus import brown # This is a fast and simple noun phrase extractor (based on NLTK)# Feel free to use it,just keep a link back to this post# http://thetokenizer.com/2013/05/09/efficIEnt-way-to-extract-the-main-topics-of-a-sentence/# Create by Shlomi Babluki# May,2013 # This is our fast Part of Speech tagger#############################################################################brown_train = brown.tagged_sents(categorIEs='news')regexp_tagger = nltk.RegexpTagger( [(r'^-?[0-9]+(.[0-9]+)?$','CD'),(r'(-|:|;)$',':'),(r'\'*$','MD'),(r'(The|the|A|a|An|an)$','AT'),(r'.*able$','JJ'),(r'^[A-Z].*$','NNP'),(r'.*ness$','NN'),(r'.*ly$','RB'),(r'.*s$','NNS'),(r'.*ing$','VBG'),(r'.*ed$','VBD'),(r'.*','NN')])unigram_tagger = nltk.UnigramTagger(brown_train,backoff=regexp_tagger)bigram_tagger = nltk.BigramTagger(brown_train,backoff=unigram_tagger)############################################################################# # This is our semi-CFG; Extend it according to your own needs#############################################################################cfg = {}cfg["NNP+NNP"] = "NNP"cfg["NN+NN"] = "NNI"cfg["NNI+NN"] = "NNI"cfg["JJ+JJ"] = "JJ"cfg["JJ+NN"] = "NNI"############################################################################# class NPExtractor(object): def __init__(self,sentence): self.sentence = sentence # Split the sentence into singlw words/tokens def tokenize_sentence(self,sentence): tokens = nltk.word_tokenize(sentence) return tokens # normalize brown corpus' Tags ("NN","NN-PL","NNS" > "NN") def normalize_Tags(self,tagged): n_tagged = [] for t in tagged: if t[1] == "NP-TL" or t[1] == "NP": n_tagged.append((t[0],"NNP")) continue if t[1].endswith("-TL"): n_tagged.append((t[0],t[1][:-3])) continue if t[1].endswith("S"): n_tagged.append((t[0],t[1][:-1])) continue n_tagged.append((t[0],t[1])) return n_tagged # Extract the main topics from the sentence def extract(self): tokens = self.tokenize_sentence(self.sentence) Tags = 
self.normalize_Tags(bigram_tagger.tag(tokens)) merge = True while merge: merge = False for x in range(0,len(Tags) - 1): t1 = Tags[x] t2 = Tags[x + 1] key = "%s+%s" % (t1[1],t2[1]) value = cfg.get(key,'') if value: merge = True Tags.pop(x) Tags.pop(x) match = "%s %s" % (t1[0],t2[0]) pos = value Tags.insert(x,(match,pos)) break matches = [] for t in Tags: if t[1] == "NNP" or t[1] == "NNI": #if t[1] == "NNP" or t[1] == "NNI" or t[1] == "NN": matches.append(t[0]) return matches # Main method,just run "python np_extractor.py"def main(): sentence = "Swayy is a beautiful new dashboard for discovering and curating online content." np_extractor = NPExtractor(sentence) result = np_extractor.extract() print "This sentence is about: %s" % ",".join(result) if __name__ == '__main__': main() 以上是内存溢出(jb51.cc)为你收集整理的全部代码内容,希望文章能够帮你解决所遇到的程序开发问题。
如果觉得内存溢出网站内容还不错,欢迎将内存溢出网站推荐给程序员好友。
总结:以上是内存溢出为你收集整理的“一个非常高效的提取内容关键词的 Python 代码”的全部内容,希望这篇文章能够帮你解决程序开发中遇到的相关问题。
如果觉得内存溢出网站内容还不错,欢迎将内存溢出网站推荐给程序员好友。
欢迎分享,转载请注明来源:内存溢出
微信扫一扫
支付宝扫一扫
评论列表(0条)