Source code for tensor_lda.utils.sample_generator

"""Simulate the generative process of LDA and generate corpus based on it
"""

import numpy as np
from scipy.sparse import coo_matrix
from scipy.stats import poisson

from sklearn.utils import check_random_state
from sklearn.externals.six.moves import xrange


[docs]class LdaSampleGenerator(object): """Generate LDA samples Parameters ---------- n_topics : int Number of topics n_words : int Number of words in corpus min_doc_size : int Min word count in a document mean_doc_size : int Mean word count in a document doc_topic_prior : double Uniform Dirichlet prior of a document topic_word_prior : double Uniform Dirichlet prior of a topic mean_doc_size: int Mean Value if word count in each document Attributes ---------- topic_word_distr_ : array, [n_topics, n_words] Topic word distribution. """ def __init__(self, n_topics, n_words, min_doc_size, mean_doc_size, doc_topic_prior, topic_word_prior, random_state=None): self.n_topics = n_topics self.n_words = n_words self.min_doc_size = min_doc_size self.mean_doc_size = mean_doc_size self.doc_topic_prior = doc_topic_prior self.topic_word_prior = topic_word_prior self.random_state = random_state self.random_state_ = check_random_state(self.random_state) # hidden variables self.topic_word_prior_ = np.repeat(topic_word_prior, n_words) # (n_topics, n_words) self.doc_topic_prior_ = np.repeat(self.doc_topic_prior, n_topics) self.topic_word_distr_ = self.random_state_.dirichlet( self.topic_word_prior_, n_topics)
[docs] def generate_documents(self, n_docs): """Generate Random doc-words Matrix Parameters ---------- n_docs : int number of documents Return ------ doc_word_mtx : sparse matrix, [n_docs, n_words] document words matrix """ rs = self.random_state_ n_topics = self.n_topics n_words = self.n_words docs_size = poisson.rvs(mu=(self.mean_doc_size - self.min_doc_size), size=n_docs, random_state=rs) docs_size += self.min_doc_size doc_prior = np.repeat(self.doc_topic_prior, n_topics) # (n_docs, n_topics) docs_distr = rs.dirichlet(doc_prior, n_docs) rows = [] cols = [] for i in xrange(n_docs): word_dist = np.dot(self.topic_word_distr_.T, docs_distr[i, :]) word_idx = rs.choice(n_words, docs_size[i], p=word_dist, replace=True) rows = np.append(rows, np.repeat(i, docs_size[i])) cols = np.append(cols, word_idx) data = np.ones(len(rows)) doc_word_mtx = coo_matrix((data, (rows, cols)), shape=(n_docs, n_words)).tocsr()
return docs_distr, doc_word_mtx