In [1]:
import matplotlib.pyplot as plt
%matplotlib inline
import gensim
import nltk

nltk.download("reuters")
nltk.download("stopwords")
nltk.download("punkt")

[nltk_data] Downloading package reuters to /home/kato/nltk_data...
[nltk_data]   Package reuters is already up-to-date!
[nltk_data] Downloading package stopwords to /home/kato/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/kato/nltk_data...
[nltk_data]   Package punkt is already up-to-date!

Out [1]: True

In [2]:
reuters = nltk.corpus.reuters
paras = reuters.paras()

In [3]:
paras[0][0][:20]

Out [3]:
['ASIAN', 'EXPORTERS', 'FEAR', 'DAMAGE', 'FROM', 'U', '.', 'S', '.-', 'JAPAN',
 'RIFT', 'Mounting', 'trade', 'friction', 'between', 'the', 'U', '.', 'S', '.']

In [4]:
stop_words = set(nltk.corpus.stopwords.words("english"))

In [5]:
sorted(stop_words)[:10]

Out [5]:
['a', 'about', 'above', 'after', 'again', 'against', 'ain', 'all', 'am', 'an']

In [6]:
import itertools

# Flatten each paragraph (a list of sentences of words) into one word stream,
# then drop stop words and one-character tokens. Note the stop-word test runs
# on the original casing while stop words are lowercase, so capitalized forms
# such as "The" slip through -- which is why "the" still appears in the topics.
reuters_texts = [itertools.chain(*p) for p in paras]
reuters_texts_filtered = [[w.lower() for w in p if w not in stop_words and len(w) >= 2]
                          for p in reuters_texts]

In [7]:
dictionary = gensim.corpora.Dictionary(reuters_texts_filtered)
reuters_corpus = [dictionary.doc2bow(text) for text in reuters_texts_filtered]

In [8]:
reuters_corpus[0][:10]

Out [8]:
[(0, 1), (1, 6), (2, 1), (3, 1), (4, 2), (5, 1), (6, 1), (7, 1), (8, 1), (9, 1)]

In [9]:
dictionary[0]  # accessing an item forces gensim to populate dictionary.id2token
model = gensim.models.ldamodel.LdaModel(reuters_corpus, num_topics=20,
                                        id2word=dictionary.id2token, random_state=0)

In [10]:
model.print_topics()

Out [10]:
[(0, '0.052*"bank" + 0.037*"mln" + 0.034*"stg" + 0.023*"said" + 0.018*"the" + 0.015*"market" + 0.013*"pct" + 0.011*"money" + 0.011*"billion" + 0.011*"dlrs"'),
 (1, '0.032*"pct" + 0.025*"said" + 0.024*"billion" + 0.016*"the" + 0.013*"year" + 0.010*"bank" + 0.007*"government" + 0.007*"trade" + 0.007*"foreign" + 0.007*"last"'),
 (2, '0.028*"lt" + 0.023*"said" + 0.017*"inc" + 0.017*"corp" + 0.015*"dlrs" + 0.015*"mln" + 0.012*"unit" + 0.012*"co" + 0.009*"company" + 0.008*"division"'),
 (3, '0.044*"said" + 0.028*"company" + 0.020*"dlrs" + 0.013*"lt" + 0.012*"share" + 0.011*"the" + 0.010*"inc" + 0.008*"offer" + 0.008*"would" + 0.007*"mln"'),
 (4, '0.046*"january" + 0.044*"billion" + 0.041*"february" + 0.032*"dlrs" + 0.031*"pct" + 0.027*"mln" + 0.023*"year" + 0.021*"rose" + 0.017*"1986" + 0.016*"december"'),
 (5, '0.024*"said" + 0.017*"dlrs" + 0.014*"the" + 0.014*"year" + 0.011*"mln" + 0.008*"pct" + 0.008*"bank" + 0.007*"banks" + 0.006*"company" + 0.006*"week"'),
 (6, '0.063*"cts" + 0.035*"april" + 0.033*"record" + 0.032*"lt" + 0.027*"div" + 0.024*"vs" + 0.024*"qtly" + 0.023*"pay" + 0.023*"prior" + 0.021*"march"'),
 (7, '0.027*"said" + 0.024*"dollar" + 0.012*"yen" + 0.011*"rates" + 0.011*"exchange" + 0.011*"currency" + 0.011*"paris" + 0.010*"the" + 0.009*"baker" + 0.009*",""'),
 (8, '0.052*"pct" + 0.033*"shares" + 0.033*"said" + 0.017*"stake" + 0.017*"lt" + 0.016*"offer" + 0.016*"group" + 0.010*"the" + 0.010*"common" + 0.009*"stock"'),
 (9, '0.037*"said" + 0.016*"oil" + 0.014*"the" + 0.010*","" + 0.009*"prices" + 0.008*"pct" + 0.007*"would" + 0.007*"year" + 0.006*"price" + 0.005*"market"'),
 (10, '0.031*"mln" + 0.027*"said" + 0.024*"tonnes" + 0.022*"000" + 0.015*"the" + 0.013*"year" + 0.011*"1986" + 0.009*"wheat" + 0.009*"dlrs" + 0.007*"production"'),
 (11, '0.031*"said" + 0.017*"lt" + 0.010*"the" + 0.010*"stock" + 0.009*"delegates" + 0.008*"buffer" + 0.008*"cocoa" + 0.007*"gulf" + 0.006*"ltd" + 0.005*"would"'),
 (12, '0.032*"said" + 0.010*"coffee" + 0.009*"the" + 0.008*"would" + 0.007*"year" + 0.006*"trade" + 0.006*","" + 0.006*"brazil" + 0.006*"china" + 0.006*"export"'),
 (13, '0.029*"said" + 0.021*"trade" + 0.017*"the" + 0.016*"would" + 0.008*"agreement" + 0.007*"japan" + 0.007*"ec" + 0.007*","" + 0.006*"bill" + 0.006*".""'),
 (14, '0.030*"dlrs" + 0.029*"said" + 0.020*"lt" + 0.015*"pct" + 0.011*"the" + 0.011*"corp" + 0.009*"cyclops" + 0.009*"usair" + 0.008*"federal" + 0.008*"company"'),
 (15, '0.034*"said" + 0.032*"lt" + 0.026*"mln" + 0.018*"stock" + 0.017*"dlrs" + 0.015*"company" + 0.015*"inc" + 0.013*"the" + 0.010*"shares" + 0.010*"corp"'),
 (16, '0.019*"said" + 0.011*"canada" + 0.009*"trade" + 0.009*"the" + 0.008*"would" + 0.008*"canadian" + 0.007*"chrysler" + 0.005*"lawson" + 0.005*"government" + 0.005*",""'),
 (17, '0.014*")," + 0.012*"plant" + 0.010*"said" + 0.009*"to" + 0.008*"saudi" + 0.008*"aluminium" + 0.007*"bolivia" + 0.007*"the" + 0.007*"nil" + 0.007*"corn"'),
 (18, '0.088*"vs" + 0.070*"mln" + 0.048*"000" + 0.042*"net" + 0.041*"cts" + 0.035*"loss" + 0.029*"dlrs" + 0.026*"shr" + 0.020*"profit" + 0.016*"qtr"'),
 (19, '0.020*"said" + 0.012*"the" + 0.009*"port" + 0.009*"shipping" + 0.008*"iran" + 0.008*"gulf" + 0.007*"soviet" + 0.007*"ship" + 0.007*"unemployment" + 0.006*"ships"')]
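print_topics returns preformatted strings, which are awkward to process further. For programmatic access, LdaModel.show_topic returns the same information as (word, probability) pairs. A minimal sketch; topic 18 (the earnings-report-style topic above) is picked arbitrarily:

# Sketch: read one topic as (word, probability) pairs instead of a string.
# Topic 18 is chosen arbitrarily from the output above.
for word, prob in model.show_topic(18, topn=10):
    print(f"{word:>10s}  {prob:.3f}")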
0.010*"coffee" + 0.009*"the" + 0.008*"would" + 0.007*"year" + 0.006*"trade" + 0.006*","" + 0.006*"brazil" + 0.006*"china" + 0.006*"export"'), (13, '0.029*"said" + 0.021*"trade" + 0.017*"the" + 0.016*"would" + 0.008*"agreement" + 0.007*"japan" + 0.007*"ec" + 0.007*","" + 0.006*"bill" + 0.006*".""'), (14, '0.030*"dlrs" + 0.029*"said" + 0.020*"lt" + 0.015*"pct" + 0.011*"the" + 0.011*"corp" + 0.009*"cyclops" + 0.009*"usair" + 0.008*"federal" + 0.008*"company"'), (15, '0.034*"said" + 0.032*"lt" + 0.026*"mln" + 0.018*"stock" + 0.017*"dlrs" + 0.015*"company" + 0.015*"inc" + 0.013*"the" + 0.010*"shares" + 0.010*"corp"'), (16, '0.019*"said" + 0.011*"canada" + 0.009*"trade" + 0.009*"the" + 0.008*"would" + 0.008*"canadian" + 0.007*"chrysler" + 0.005*"lawson" + 0.005*"government" + 0.005*",""'), (17, '0.014*")," + 0.012*"plant" + 0.010*"said" + 0.009*"to" + 0.008*"saudi" + 0.008*"aluminium" + 0.007*"bolivia" + 0.007*"the" + 0.007*"nil" + 0.007*"corn"'), (18, '0.088*"vs" + 0.070*"mln" + 0.048*"000" + 0.042*"net" + 0.041*"cts" + 0.035*"loss" + 0.029*"dlrs" + 0.026*"shr" + 0.020*"profit" + 0.016*"qtr"'), (19, '0.020*"said" + 0.012*"the" + 0.009*"port" + 0.009*"shipping" + 0.008*"iran" + 0.008*"gulf" + 0.007*"soviet" + 0.007*"ship" + 0.007*"unemployment" + 0.006*"ships"')] In [11]:pred, _ = model.inference(reuters_corpus) pred /= pred.sum(axis=1).reshape(-1, 1) In [12]:fig, axes = plt.subplots(1, 5, figsize=(15, 3)) for i in range(5): axes[i].set_ylim(0, 1) axes[i].bar(range(20), pred[i]) In [13]:pred[0].argmax() Out [13]:13 In [14]:fileids = reuters.fileids() print(reuters.raw(fileids[0])[:300]) ASIAN EXPORTERS FEAR DAMAGE FROM U.S.-JAPAN RIFT Mounting trade friction between the U.S. And Japan has raised fears among many of Asia's exporting nations that the row could inflict far-reaching economic damage, businessmen and officials said. 
In [15]:
import csv

quora = []
with open("data/quora_duplicate_questions.tsv") as fp:
    reader = csv.reader(fp, delimiter="\t")
    next(reader)  # skip the header row
    for row in reader:
        quora.append(nltk.tokenize.word_tokenize(row[3].lower()))  # column 3 is question1
print(quora[:10])

[['what', 'is', 'the', 'step', 'by', 'step', 'guide', 'to', 'invest', 'in', 'share', 'market', 'in', 'india', '?'],
 ['what', 'is', 'the', 'story', 'of', 'kohinoor', '(', 'koh-i-noor', ')', 'diamond', '?'],
 ['how', 'can', 'i', 'increase', 'the', 'speed', 'of', 'my', 'internet', 'connection', 'while', 'using', 'a', 'vpn', '?'],
 ['why', 'am', 'i', 'mentally', 'very', 'lonely', '?', 'how', 'can', 'i', 'solve', 'it', '?'],
 ['which', 'one', 'dissolve', 'in', 'water', 'quikly', 'sugar', ',', 'salt', ',', 'methane', 'and', 'carbon', 'di', 'oxide', '?'],
 ['astrology', ':', 'i', 'am', 'a', 'capricorn', 'sun', 'cap', 'moon', 'and', 'cap', 'rising', '...', 'what', 'does', 'that', 'say', 'about', 'me', '?'],
 ['should', 'i', 'buy', 'tiago', '?'],
 ['how', 'can', 'i', 'be', 'a', 'good', 'geologist', '?'],
 ['when', 'do', 'you', 'use', 'シ', 'instead', 'of', 'し', '?'],
 ['motorola', '(', 'company', ')', ':', 'can', 'i', 'hack', 'my', 'charter', 'motorolla', 'dcx3400', '?']]

In [16]:
quora_filtered = []
for sentence in quora:
    quora_filtered.append([w for w in sentence if w not in stop_words and len(w) > 1])
print(quora_filtered[:10])

[['step', 'step', 'guide', 'invest', 'share', 'market', 'india'],
 ['story', 'kohinoor', 'koh-i-noor', 'diamond'],
 ['increase', 'speed', 'internet', 'connection', 'using', 'vpn'],
 ['mentally', 'lonely', 'solve'],
 ['one', 'dissolve', 'water', 'quikly', 'sugar', 'salt', 'methane', 'carbon', 'di', 'oxide'],
 ['astrology', 'capricorn', 'sun', 'cap', 'moon', 'cap', 'rising', '...', 'say'],
 ['buy', 'tiago'],
 ['good', 'geologist'],
 ['use', 'instead'],
 ['motorola', 'company', 'hack', 'charter', 'motorolla', 'dcx3400']]

In [17]:
# gensim < 4.0 API; in gensim >= 4.0 the "size" parameter is named "vector_size".
model = gensim.models.Word2Vec(quora_filtered, size=50, window=5)

In [18]:
model.wv.similar_by_word("sick")

Out [18]:
[('dizzy', 0.8926801681518555),
 ('hungry', 0.8812119960784912),
 ('angry', 0.8726974129676819),
 ('tired', 0.8566402196884155),
 ('jealous', 0.8490884900093079),
 ('uncomfortable', 0.8480300903320312),
 ('bullied', 0.844206690788269),
 ('drunk', 0.8416849374771118),
 ('horny', 0.8360753059387207),
 ('rude', 0.8356759548187256)]

In [19]:
model.wv.similar_by_word("actor")  # similar_by_vector expects a vector, not a word

Out [19]:
[('actress', 0.9272599220275879),
 ('singer', 0.907437801361084),
 ('bollywood', 0.8912582397460938),
 ('actors', 0.8538996577262878),
 ('remade', 0.8361212015151978),
 ('actor/actress', 0.8340005874633789),
 ('hollywood', 0.8316481113433838),
 ('films', 0.8285355567932129),
 ('superhero', 0.8265222907066345),
 ('comedy', 0.8245402574539185)]
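The same KeyedVectors object supports analogy-style vector arithmetic via most_similar, and can be saved for reuse without retraining. A minimal sketch; the file name is made up, and results will vary between runs because the Word2Vec training in In [17] is not seeded:

# Sketch: analogy-style query on the trained vectors.
# Assumes "actress", "man", and "woman" survived the min_count cutoff.
print(model.wv.most_similar(positive=["actress", "man"], negative=["woman"], topn=5))

# Persist just the vectors; "quora_w2v.kv" is an arbitrary file name.
model.wv.save("quora_w2v.kv")
# Reload later with: wv = gensim.models.KeyedVectors.load("quora_w2v.kv")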