LDA Code
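A from-scratch implementation of Latent Dirichlet Allocation (LDA): a toy corpus of 15 short "interest" documents is fit with K = 4 topics using collapsed Gibbs sampling.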
import random
import pandas as pd
from collections import Counter
K = 4  # number of topics
documents = [["Hadoop", "Big Data", "HBase", "Java", "Spark", "Storm", "Cassandra"],
["NoSQL", "MongoDB", "Cassandra", "HBase", "Postgres"],
["Python", "scikit-learn", "scipy", "numpy", "statsmodels", "pandas"],
["R", "Python", "statistics", "regression", "probability"],
["machine learning", "regression", "decision trees", "libsvm"],
["Python", "R", "Java", "C++", "Haskell", "programming languages"],
["statistics", "probability", "mathematics", "theory"],
["machine learning", "scikit-learn", "Mahout", "neural networks"],
["neural networks", "deep learning", "Big Data", "artificial intelligence"],
["Hadoop", "Java", "MapReduce", "Big Data"],
["statistics", "R", "statsmodels"],
["C++", "deep learning", "artificial intelligence", "probability"],
["pandas", "R", "Python"],
["databases", "HBase", "Postgres", "MySQL", "MongoDB"],
["libsvm", "regression", "support vector machines"]]
# a list of Counters, one for each document
document_topic_counts = [Counter() for _ in documents]
# a list of Counters, one for each topic
topic_word_counts = [Counter() for _ in range(K)]
# a list of numbers, one for each topic
topic_counts = [0 for _ in range(K)]
document_lengths = list(map(len, documents))
document_lengths
[7, 5, 6, 5, 4, 6, 4, 4, 4, 4, 3, 4, 3, 5, 3]
distinct_words = set(word for document in documents for word in document)
W = len(distinct_words)
W
36
D = len(documents)
D
15
def p_topic_given_document(topic, d, alpha=0.1):
    """the fraction of words in document _d_
    that are assigned to _topic_ (plus some smoothing)"""
    return ((document_topic_counts[d][topic] + alpha) /
            (document_lengths[d] + K * alpha))

def p_word_given_topic(word, topic, beta=0.1):
    """the fraction of words assigned to _topic_
    that equal _word_ (plus some smoothing)"""
    return ((topic_word_counts[topic][word] + beta) /
            (topic_counts[topic] + W * beta))

def topic_weight(d, word, topic):
    """given a document and a word in that document,
    return the weight for the kth topic"""
    return p_word_given_topic(word, topic) * p_topic_given_document(topic, d)
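For reference, topic_weight is (up to normalization) the standard collapsed Gibbs sampling conditional for LDA. Writing n_dk for document_topic_counts[d][topic], n_kw for topic_word_counts[topic][word], and n_k for topic_counts[topic]:

    P(z = k | all other assignments) ∝ (n_dk + alpha) / (document_lengths[d] + K*alpha)
                                     * (n_kw + beta) / (n_k + W*beta)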
def sample_from(weights):
    """returns i with probability weights[i] / sum(weights)"""
    total = sum(weights)
    rnd = total * random.random()  # uniform between 0 and total
    for i, p in enumerate(weights):
        rnd -= p                   # return the smallest i such that
        if rnd <= 0:               # weights[0] + ... + weights[i] >= rnd
            return i

def choose_new_topic(d, word):
    return sample_from([topic_weight(d, word, topic)
                        for topic in range(K)])
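As a quick illustrative check (not part of the original code), sample_from should return each index roughly in proportion to its weight:

# illustrative: with weights [1, 1, 3], expect index 2 about 60% of the
# time and indices 0 and 1 about 20% each
print(Counter(sample_from([1, 1, 3]) for _ in range(10000)))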
# randomly assign an initial topic to every word in every document
document_topics = [[random.randrange(K) for word in document]
                   for document in documents]
document_topics
[[3, 2, 3, 3, 2, 3, 1],
[3, 0, 0, 3, 3],
[3, 0, 2, 0, 0, 2],
[3, 2, 0, 0, 2],
[1, 1, 2, 2],
[0, 1, 0, 1, 0, 3],
[1, 1, 2, 1],
[1, 1, 3, 1],
[3, 2, 0, 1],
[2, 2, 3, 2],
[0, 3, 2],
[1, 3, 2, 0],
[0, 2, 2],
[0, 0, 3, 0, 3],
[1, 3, 1]]
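These initial assignments come from one particular random run, so a re-run will differ. If reproducibility matters, one option (my suggestion, not in the original) is to fix the seed before the initialization above:

random.seed(0)  # hypothetical: makes the random init and the Gibbs run repeatable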
for d in range(D):
    for word, topic in zip(documents[d], document_topics[d]):
        document_topic_counts[d][topic] += 1
        topic_word_counts[topic][word] += 1
        topic_counts[topic] += 1
document_topic_counts
[Counter({1: 1, 2: 2, 3: 4}),
Counter({0: 2, 3: 3}),
Counter({0: 3, 2: 2, 3: 1}),
Counter({0: 2, 2: 2, 3: 1}),
Counter({1: 2, 2: 2}),
Counter({0: 3, 1: 2, 3: 1}),
Counter({1: 3, 2: 1}),
Counter({1: 3, 3: 1}),
Counter({0: 1, 1: 1, 2: 1, 3: 1}),
Counter({2: 3, 3: 1}),
Counter({0: 1, 2: 1, 3: 1}),
Counter({0: 1, 1: 1, 2: 1, 3: 1}),
Counter({0: 1, 2: 2}),
Counter({0: 3, 3: 2}),
Counter({1: 2, 3: 1})]
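A small consistency check (my addition, not in the original): every document's topic counts should sum to its length, and the topic totals should sum to the corpus size:

assert all(sum(document_topic_counts[d].values()) == document_lengths[d]
           for d in range(D))
assert sum(topic_counts) == sum(document_lengths)  # 67 words in total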
Training
for epoch in range(1000):  # number of sweeps over the corpus
    for d in range(D):     # each document
        for i, (word, topic) in enumerate(zip(documents[d],
                                              document_topics[d])):
            # Gibbs sampling: remove this single topic assignment z and
            # sample from the conditional given all the others (-z)
            # remove this word / topic from the counts
            # so that it doesn't influence the weights
            document_topic_counts[d][topic] -= 1  # per-document topic counts
            topic_word_counts[topic][word] -= 1   # per-topic word counts
            topic_counts[topic] -= 1              # per-topic totals
            document_lengths[d] -= 1              # per-document word counts
            # choose a new topic based on the weights
            new_topic = choose_new_topic(d, word)
            document_topics[d][i] = new_topic
            # and now add it back to the counts
            document_topic_counts[d][new_topic] += 1  # per-document topic counts
            topic_word_counts[new_topic][word] += 1   # per-topic word counts
            topic_counts[new_topic] += 1              # per-topic totals
            document_lengths[d] += 1                  # per-document word counts
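Once training finishes, a minimal sketch (not in the original) for reading off each document's smoothed topic mixture is to reuse p_topic_given_document:

for d in range(D):
    mixture = [round(p_topic_given_document(k, d), 2) for k in range(K)]
    print(documents[d][:3], mixture)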
Inspect the results using each topic's word distribution (topic_word_counts):
df = pd.DataFrame(columns=['Topic1', 'Topic2', 'Topic3', 'Topic4'],
                  index=['Top' + str(i) for i in range(1, 6)])
for k, word_counts in enumerate(topic_word_counts):
    for ix, (word, count) in enumerate(word_counts.most_common(5)):  # top 5 words per topic
        df.loc['Top' + str(ix + 1), 'Topic' + str(k + 1)] = word + '({})'.format(count)
print(df)
      Topic1         Topic2        Topic3                       Topic4
Top1  Python(4)      HBase(3)      probability(3)               Big Data(3)
Top2  R(4)           Postgres(2)   statistics(3)                Java(3)
Top3  regression(3)  MongoDB(2)    neural networks(2)           Hadoop(2)
Top4  pandas(2)      Cassandra(1)  artificial intelligence(2)   Cassandra(1)
Top5  libsvm(2)      MySQL(1)      deep learning(2)             machine learning(1)
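Judging from the top words of this run, one might label the topics by hand, in the spirit of the snippet below (hypothetical labels, one possible reading):

topic_names = ['Python and R data analysis',    # Topic1
               'databases',                     # Topic2
               'statistics and deep learning',  # Topic3
               'Big Data and Hadoop']           # Topic4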