In [1]:
import matplotlib.pyplot as plt
%matplotlib inline
import gensim
import nltk

nltk.download("reuters")
nltk.download("stopwords")
nltk.download("punkt")

[nltk_data] Downloading package reuters to /home/kato/nltk_data...
[nltk_data]   Package reuters is already up-to-date!
[nltk_data] Downloading package stopwords to /home/kato/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/kato/nltk_data...
[nltk_data]   Package punkt is already up-to-date!

Out [1]: True

In [2]:
reuters = nltk.corpus.reuters
paras = reuters.paras()

In [3]:
paras[0][0][:20]

Out [3]:
['ASIAN', 'EXPORTERS', 'FEAR', 'DAMAGE', 'FROM', 'U', '.', 'S', '.-', 'JAPAN',
 'RIFT', 'Mounting', 'trade', 'friction', 'between', 'the', 'U', '.', 'S', '.']

In [4]:
stop_words = set(nltk.corpus.stopwords.words("english"))

In [5]:
sorted(stop_words)[:10]

Out [5]:
['a', 'about', 'above', 'after', 'again', 'against', 'ain', 'all', 'am', 'an']

In [6]:
import itertools

# Flatten each paragraph (a list of sentences of words) into one word stream,
# then drop stop words and one-character tokens. Note the stop-word test runs
# on the original casing while stop words are lowercase, so capitalized forms
# such as "The" slip through -- which is why "the" still appears in the topics.
reuters_texts = [itertools.chain(*p) for p in paras]
reuters_texts_filtered = [[w.lower() for w in p if w not in stop_words and len(w) >= 2]
                          for p in reuters_texts]

In [7]:
dictionary = gensim.corpora.Dictionary(reuters_texts_filtered)
reuters_corpus = [dictionary.doc2bow(text) for text in reuters_texts_filtered]

In [8]:
reuters_corpus[0][:10]

Out [8]:
[(0, 1), (1, 6), (2, 1), (3, 1), (4, 2), (5, 1), (6, 1), (7, 1), (8, 1), (9, 1)]

In [9]:
dictionary[0]  # accessing an item forces gensim to populate dictionary.id2token
model = gensim.models.ldamodel.LdaModel(reuters_corpus, num_topics=20,
                                        id2word=dictionary.id2token, random_state=0)

In [10]:
model.print_topics()

Out [10]:
[(0, '0.052*"bank" + 0.037*"mln" + 0.034*"stg" + 0.023*"said" + 0.018*"the" + 0.015*"market" + 0.013*"pct" + 0.011*"money" + 0.011*"billion" + 0.011*"dlrs"'),
 (1, '0.032*"pct" + 0.025*"said" + 0.024*"billion" + 0.016*"the" + 0.013*"year" + 0.010*"bank" + 0.007*"government" + 0.007*"trade" + 0.007*"foreign" + 0.007*"last"'),
 (2, '0.028*"lt" + 0.023*"said" + 0.017*"inc" + 0.017*"corp" + 0.015*"dlrs" + 0.015*"mln" + 0.012*"unit" + 0.012*"co" + 0.009*"company" + 0.008*"division"'),
 (3, '0.044*"said" + 0.028*"company" + 0.020*"dlrs" + 0.013*"lt" + 0.012*"share" + 0.011*"the" + 0.010*"inc" + 0.008*"offer" + 0.008*"would" + 0.007*"mln"'),
 (4, '0.046*"january" + 0.044*"billion" + 0.041*"february" + 0.032*"dlrs" + 0.031*"pct" + 0.027*"mln" + 0.023*"year" + 0.021*"rose" + 0.017*"1986" + 0.016*"december"'),
 (5, '0.024*"said" + 0.017*"dlrs" + 0.014*"the" + 0.014*"year" + 0.011*"mln" + 0.008*"pct" + 0.008*"bank" + 0.007*"banks" + 0.006*"company" + 0.006*"week"'),
 (6, '0.063*"cts" + 0.035*"april" + 0.033*"record" + 0.032*"lt" + 0.027*"div" + 0.024*"vs" + 0.024*"qtly" + 0.023*"pay" + 0.023*"prior" + 0.021*"march"'),
 (7, '0.027*"said" + 0.024*"dollar" + 0.012*"yen" + 0.011*"rates" + 0.011*"exchange" + 0.011*"currency" + 0.011*"paris" + 0.010*"the" + 0.009*"baker" + 0.009*",""'),
 (8, '0.052*"pct" + 0.033*"shares" + 0.033*"said" + 0.017*"stake" + 0.017*"lt" + 0.016*"offer" + 0.016*"group" + 0.010*"the" + 0.010*"common" + 0.009*"stock"'),
 (9, '0.037*"said" + 0.016*"oil" + 0.014*"the" + 0.010*","" + 0.009*"prices" + 0.008*"pct" + 0.007*"would" + 0.007*"year" + 0.006*"price" + 0.005*"market"'),
 (10, '0.031*"mln" + 0.027*"said" + 0.024*"tonnes" + 0.022*"000" + 0.015*"the" + 0.013*"year" + 0.011*"1986" + 0.009*"wheat" + 0.009*"dlrs" + 0.007*"production"'),
 (11, '0.031*"said" + 0.017*"lt" + 0.010*"the" + 0.010*"stock" + 0.009*"delegates" + 0.008*"buffer" + 0.008*"cocoa" + 0.007*"gulf" + 0.006*"ltd" + 0.005*"would"'),
 (12, '0.032*"said" + 0.010*"coffee" + 0.009*"the" + 0.008*"would" + 0.007*"year" + 0.006*"trade" + 0.006*","" + 0.006*"brazil" + 0.006*"china" + 0.006*"export"'),
 (13, '0.029*"said" + 0.021*"trade" + 0.017*"the" + 0.016*"would" + 0.008*"agreement" + 0.007*"japan" + 0.007*"ec" + 0.007*","" + 0.006*"bill" + 0.006*".""'),
 (14, '0.030*"dlrs" + 0.029*"said" + 0.020*"lt" + 0.015*"pct" + 0.011*"the" + 0.011*"corp" + 0.009*"cyclops" + 0.009*"usair" + 0.008*"federal" + 0.008*"company"'),
 (15, '0.034*"said" + 0.032*"lt" + 0.026*"mln" + 0.018*"stock" + 0.017*"dlrs" + 0.015*"company" + 0.015*"inc" + 0.013*"the" + 0.010*"shares" + 0.010*"corp"'),
 (16, '0.019*"said" + 0.011*"canada" + 0.009*"trade" + 0.009*"the" + 0.008*"would" + 0.008*"canadian" + 0.007*"chrysler" + 0.005*"lawson" + 0.005*"government" + 0.005*",""'),
 (17, '0.014*")," + 0.012*"plant" + 0.010*"said" + 0.009*"to" + 0.008*"saudi" + 0.008*"aluminium" + 0.007*"bolivia" + 0.007*"the" + 0.007*"nil" + 0.007*"corn"'),
 (18, '0.088*"vs" + 0.070*"mln" + 0.048*"000" + 0.042*"net" + 0.041*"cts" + 0.035*"loss" + 0.029*"dlrs" + 0.026*"shr" + 0.020*"profit" + 0.016*"qtr"'),
 (19, '0.020*"said" + 0.012*"the" + 0.009*"port" + 0.009*"shipping" + 0.008*"iran" + 0.008*"gulf" + 0.007*"soviet" + 0.007*"ship" + 0.007*"unemployment" + 0.006*"ships"')]
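print_topics returns preformatted strings, which are awkward to process further. For programmatic access, LdaModel.show_topic returns the same information as (word, probability) pairs. A minimal sketch; topic 18 (the earnings-report-style topic above) is picked arbitrarily:

# Sketch: read one topic as (word, probability) pairs instead of a string.
# Topic 18 is chosen arbitrarily from the output above.
for word, prob in model.show_topic(18, topn=10):
    print(f"{word:>10s}  {prob:.3f}")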
0.010*"coffee" + 0.009*"the" + 0.008*"would" + 0.007*"year" + 0.006*"trade" + 0.006*","" + 0.006*"brazil" + 0.006*"china" + 0.006*"export"'), (13, '0.029*"said" + 0.021*"trade" + 0.017*"the" + 0.016*"would" + 0.008*"agreement" + 0.007*"japan" + 0.007*"ec" + 0.007*","" + 0.006*"bill" + 0.006*".""'), (14, '0.030*"dlrs" + 0.029*"said" + 0.020*"lt" + 0.015*"pct" + 0.011*"the" + 0.011*"corp" + 0.009*"cyclops" + 0.009*"usair" + 0.008*"federal" + 0.008*"company"'), (15, '0.034*"said" + 0.032*"lt" + 0.026*"mln" + 0.018*"stock" + 0.017*"dlrs" + 0.015*"company" + 0.015*"inc" + 0.013*"the" + 0.010*"shares" + 0.010*"corp"'), (16, '0.019*"said" + 0.011*"canada" + 0.009*"trade" + 0.009*"the" + 0.008*"would" + 0.008*"canadian" + 0.007*"chrysler" + 0.005*"lawson" + 0.005*"government" + 0.005*",""'), (17, '0.014*")," + 0.012*"plant" + 0.010*"said" + 0.009*"to" + 0.008*"saudi" + 0.008*"aluminium" + 0.007*"bolivia" + 0.007*"the" + 0.007*"nil" + 0.007*"corn"'), (18, '0.088*"vs" + 0.070*"mln" + 0.048*"000" + 0.042*"net" + 0.041*"cts" + 0.035*"loss" + 0.029*"dlrs" + 0.026*"shr" + 0.020*"profit" + 0.016*"qtr"'), (19, '0.020*"said" + 0.012*"the" + 0.009*"port" + 0.009*"shipping" + 0.008*"iran" + 0.008*"gulf" + 0.007*"soviet" + 0.007*"ship" + 0.007*"unemployment" + 0.006*"ships"')] In [11]:pred, _ = model.inference(reuters_corpus) pred /= pred.sum(axis=1).reshape(-1, 1) In [12]:fig, axes = plt.subplots(1, 5, figsize=(15, 3)) for i in range(5): axes[i].set_ylim(0, 1) axes[i].bar(range(20), pred[i]) In [13]:pred[0].argmax() Out [13]:13 In [14]:fileids = reuters.fileids() print(reuters.raw(fileids[0])[:300]) ASIAN EXPORTERS FEAR DAMAGE FROM U.S.-JAPAN RIFT Mounting trade friction between the U.S. And Japan has raised fears among many of Asia's exporting nations that the row could inflict far-reaching economic damage, businessmen and officials said. 
In [15]:
import csv

quora = []
with open("data/quora_duplicate_questions.tsv") as fp:
    reader = csv.reader(fp, delimiter="\t")
    next(reader)  # skip the header row
    for row in reader:
        quora.append(nltk.tokenize.word_tokenize(row[3].lower()))  # column 3 is question1
print(quora[:10])

[['what', 'is', 'the', 'step', 'by', 'step', 'guide', 'to', 'invest', 'in', 'share', 'market', 'in', 'india', '?'],
 ['what', 'is', 'the', 'story', 'of', 'kohinoor', '(', 'koh-i-noor', ')', 'diamond', '?'],
 ['how', 'can', 'i', 'increase', 'the', 'speed', 'of', 'my', 'internet', 'connection', 'while', 'using', 'a', 'vpn', '?'],
 ['why', 'am', 'i', 'mentally', 'very', 'lonely', '?', 'how', 'can', 'i', 'solve', 'it', '?'],
 ['which', 'one', 'dissolve', 'in', 'water', 'quikly', 'sugar', ',', 'salt', ',', 'methane', 'and', 'carbon', 'di', 'oxide', '?'],
 ['astrology', ':', 'i', 'am', 'a', 'capricorn', 'sun', 'cap', 'moon', 'and', 'cap', 'rising', '...', 'what', 'does', 'that', 'say', 'about', 'me', '?'],
 ['should', 'i', 'buy', 'tiago', '?'],
 ['how', 'can', 'i', 'be', 'a', 'good', 'geologist', '?'],
 ['when', 'do', 'you', 'use', 'シ', 'instead', 'of', 'し', '?'],
 ['motorola', '(', 'company', ')', ':', 'can', 'i', 'hack', 'my', 'charter', 'motorolla', 'dcx3400', '?']]

In [16]:
quora_filtered = []
for sentence in quora:
    quora_filtered.append([w for w in sentence if w not in stop_words and len(w) > 1])
print(quora_filtered[:10])

[['step', 'step', 'guide', 'invest', 'share', 'market', 'india'],
 ['story', 'kohinoor', 'koh-i-noor', 'diamond'],
 ['increase', 'speed', 'internet', 'connection', 'using', 'vpn'],
 ['mentally', 'lonely', 'solve'],
 ['one', 'dissolve', 'water', 'quikly', 'sugar', 'salt', 'methane', 'carbon', 'di', 'oxide'],
 ['astrology', 'capricorn', 'sun', 'cap', 'moon', 'cap', 'rising', '...', 'say'],
 ['buy', 'tiago'],
 ['good', 'geologist'],
 ['use', 'instead'],
 ['motorola', 'company', 'hack', 'charter', 'motorolla', 'dcx3400']]

In [17]:
# gensim < 4.0 API; in gensim >= 4.0 the "size" parameter is named "vector_size".
model = gensim.models.Word2Vec(quora_filtered, size=50, window=5)

In [18]:
model.wv.similar_by_word("sick")

Out [18]:
[('dizzy', 0.8926801681518555),
 ('hungry', 0.8812119960784912),
 ('angry', 0.8726974129676819),
 ('tired', 0.8566402196884155),
 ('jealous', 0.8490884900093079),
 ('uncomfortable', 0.8480300903320312),
 ('bullied', 0.844206690788269),
 ('drunk', 0.8416849374771118),
 ('horny', 0.8360753059387207),
 ('rude', 0.8356759548187256)]

In [19]:
model.wv.similar_by_word("actor")  # similar_by_vector expects a vector, not a word

Out [19]:
[('actress', 0.9272599220275879),
 ('singer', 0.907437801361084),
 ('bollywood', 0.8912582397460938),
 ('actors', 0.8538996577262878),
 ('remade', 0.8361212015151978),
 ('actor/actress', 0.8340005874633789),
 ('hollywood', 0.8316481113433838),
 ('films', 0.8285355567932129),
 ('superhero', 0.8265222907066345),
 ('comedy', 0.8245402574539185)]
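The same KeyedVectors object supports analogy-style vector arithmetic via most_similar, and can be saved for reuse without retraining. A minimal sketch; the file name is made up, and results will vary between runs because the Word2Vec training in In [17] is not seeded:

# Sketch: analogy-style query on the trained vectors.
# Assumes "actress", "man", and "woman" survived the min_count cutoff.
print(model.wv.most_similar(positive=["actress", "man"], negative=["woman"], topn=5))

# Persist just the vectors; "quora_w2v.kv" is an arbitrary file name.
model.wv.save("quora_w2v.kv")
# Reload later with: wv = gensim.models.KeyedVectors.load("quora_w2v.kv")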