项目作者: cuteboydot

项目描述 :
Implementation of LDA for documents clustering using Gibbs sampling.
高级语言: Python
项目地址: git://github.com/cuteboydot/Latent-Dirichlet-Allocation.git
创建时间: 2017-06-06T08:49:21Z
项目社区: https://github.com/cuteboydot/Latent-Dirichlet-Allocation

开源协议:

下载


Latent-Dirichlet-Allocation

Implementation of LDA using Gibbs sampling
cuteboydot@gmail.com

reference : https://en.wikipedia.org/wiki/Latent_Dirichlet_allocation

LDA : Topic modeling






Example : 6 docs, 2 topics








  • code
    ```python
    import numpy as np
    import random

import sys  # sys.maxsize is the documented "never summarize" print threshold

# Print full arrays with fixed float precision.
# NOTE: the original passed threshold=np.nan, which modern numpy rejects
# with ValueError; sys.maxsize is the supported way to disable truncation.
np.set_printoptions(precision=4)
np.set_printoptions(suppress=True)
np.set_printoptions(threshold=sys.maxsize)
np.set_printoptions(linewidth=50)

# LDA hyper-parameters.
alpha = 0.1   # Dirichlet prior (smoothing) for the per-document topic counts
beta = 0.001  # Dirichlet prior (smoothing) for the per-topic word counts
topics = 2    # number of latent topics
epoch = 30    # number of Gibbs sampling sweeps over the corpus

# Toy corpus: six short Korean documents. The first three share a
# food/diet vocabulary, the last three a travel vocabulary, so 2 topics
# should separate them. (Original used curly quotes — a syntax error.)
docs = np.array(("칼로리 레시피 서비스 식재료 먹거리",
                 "도시락 건강식 다이어트 칼로리 레시피",
                 "마케팅 다이어트 식재료 배송 칼로리",
                 "여행 YOLO 혼술 휴가 연휴",
                 "여행 예약 항공권 마케팅 연휴",
                 "항공권 예약 호텔 다구간 서비스"))

# Tokenize every document and collect the full (with duplicates) and
# unique word lists.
words_full = []
words_uniq = []
doc_word = np.zeros((docs.shape[0]))        # NOTE(review): never read afterwards; kept for compatibility
doc_words_size = np.zeros((docs.shape[0]))  # number of tokens per document
for a, doc in enumerate(docs):
    doc_words = doc.split()
    words_full += doc_words
    doc_words_size[a] = len(doc_words)
words_full = np.array(words_full)
print("words_full")
print(words_full)

words = np.array(list(set(words_full)))
# np.unique is already sorted and 1-D; the reshape is a no-op kept from
# the original code.
words_uniq = np.unique(words_full)
words_uniq = np.reshape(words_uniq, (words_uniq.shape[0]))
print("words_uniq")
print(words_uniq)

# Build the sampling table: one row per word occurrence, columns are
# [word, doc num, topic num, unique word index]. Topics start random.
# The dummy 'keyword' row only fixes the array shape for vstack and is
# sliced off afterwards.
word_doc_topic = np.array(['keyword', 0, 0, 0])
for a, doc in enumerate(docs):
    words = doc.split()
    for word in words:
        id_uniq = np.where(words_uniq == word)[0]
        to = random.randrange(0, topics)  # random initial topic assignment
        element = (word, a, to, id_uniq[0])
        word_doc_topic = np.vstack((word_doc_topic, element))
word_doc_topic = word_doc_topic[1:, :]  # drop the dummy header row
print("word_doc_topic")
print(word_doc_topic)

# Global count/probability tables, overwritten in place by gibbs_proc():
# theta: per-document topic distribution, phi: per-topic word distribution.
theta_num = np.zeros((docs.shape[0], topics))
theta_prob = np.zeros((docs.shape[0], topics))
phi_num = np.zeros((words_uniq.shape[0], topics))
phi_prob = np.zeros((words_uniq.shape[0], topics))

def gibbs_proc(word_doc_topic_task, sample, idx):
    """Recompute theta/phi from the current assignments and, when idx >= 0,
    return the most probable topic for `sample`.

    Mutates the module-level tables theta_num/theta_prob/phi_num/phi_prob.

    Parameters
    ----------
    word_doc_topic_task : ndarray of str rows [word, doc num, topic num, uniq idx]
        Current assignment table, normally with the sampled row removed.
    sample : sequence
        The held-out row [word, doc num, topic num, uniq idx]; only
        sample[1] (doc) and sample[3] (unique word index) are read.
    idx : int
        >= 0 to resample `sample`'s topic; negative to only refresh the
        tables (final-state call) — then returns None.
    """
    # make topic-doc relation: smoothed counts of (doc, topic) pairs
    for a in range(docs.shape[0]):
        for b in range(topics):
            count = np.count_nonzero((word_doc_topic_task[:, 1] == str(a)) &
                                     (word_doc_topic_task[:, 2] == str(b)))
            theta_num[a][b] = count + alpha
    for a in range(docs.shape[0]):
        # row total is invariant over b — hoisted out of the inner loop
        row_total = np.sum(theta_num[a])
        for b in range(topics):
            theta_prob[a][b] = float(theta_num[a][b]) / float(row_total)

    # make word-topic relation: smoothed counts of (word, topic) pairs
    for a in range(words_uniq.shape[0]):
        for b in range(topics):
            count = np.count_nonzero((word_doc_topic_task[:, 0] == str(words_uniq[a])) &
                                     (word_doc_topic_task[:, 2] == str(b)))
            phi_num[a][b] = count + beta
    for a in range(words_uniq.shape[0]):
        row_total = np.sum(phi_num[a])
        for b in range(topics):
            phi_prob[a][b] = float(phi_num[a][b]) / float(row_total)
    del word_doc_topic_task

    # allocate topic for the held-out word:
    # p(topic | doc, word) ∝ p(topic | doc) * p(word | topic)
    if idx >= 0:
        p_post = np.zeros((topics))
        for a in range(topics):
            p_topic_doc = theta_prob[int(sample[1])][a]
            topic_tot = np.sum((phi_num.T)[a])  # total mass of topic a
            p_word_topic = phi_num[int(sample[3])][a] / topic_tot
            p_post[a] = p_topic_doc * p_word_topic
        topic_max = np.argmax(p_post)
        return topic_max
    # refresh-only call (idx < 0): explicit for readability
    return None

if __name__ == "__main__":
    # Gibbs sampling: for each sweep, hold out one word occurrence at a
    # time, rebuild the tables from the rest, and reassign its topic.
    # NOTE: argmax is used instead of sampling from the posterior, so
    # this is really "iterated conditional modes" rather than true Gibbs.
    for a in range(epoch):
        for b in range(word_doc_topic.shape[0]):
            word_doc_topic_task = word_doc_topic.copy()
            sample = word_doc_topic_task[b]
            # remove the held-out row before counting
            word_doc_topic_task = np.delete(word_doc_topic_task, b, axis=0)
            topic_max = gibbs_proc(word_doc_topic_task, sample, b)
            word_doc_topic[b][2] = topic_max
            del word_doc_topic_task

    # print final state (refresh tables over the full assignment table)
    gibbs_proc(word_doc_topic, [None, None, None, None], -1)
    print("theta P(Topic;Doc)")
    for a in range(theta_num.shape[0]):
        print("Doc%d => %s = %s" % (a, str(theta_num[a]), str(theta_prob[a])))
    print("phi P(Word;Topic)")
    for a in range(phi_num.shape[0]):
        print("%s => %s = %s" % (words_uniq[a], str(phi_num[a]), str(phi_prob[a])))
    print("word_doc_topic")
    print(word_doc_topic)

```