Source code for octis.models.LDA

from octis.models.model import AbstractModel
import numpy as np
from gensim.models import ldamodel
import gensim.corpora as corpora
import octis.configuration.citations as citations
import octis.configuration.defaults as defaults


class LDA(AbstractModel):

    id2word = None
    id_corpus = None
    use_partitions = True
    update_with_test = False

    def __init__(
        self, num_topics=100, distributed=False, chunksize=2000, passes=1,
        update_every=1, alpha="symmetric", eta=None, decay=0.5, offset=1.0,
        eval_every=10, iterations=50, gamma_threshold=0.001,
            random_state=None):
        """
        Initialize LDA model

        Parameters
        ----------
        num_topics (int, optional) – The number of requested latent topics to
        be extracted from the training corpus.

        distributed (bool, optional) – Whether distributed computing should be
        used to accelerate training.

        chunksize (int, optional) – Number of documents to be used in each
        training chunk.

        passes (int, optional) – Number of passes through the corpus during
        training.

        update_every (int, optional) – Number of documents to be iterated
        through for each update. Set to 0 for batch learning, > 1 for online
        iterative learning.

        alpha ({numpy.ndarray, str}, optional) – Can be set to a 1D array of
        length equal to the number of expected topics that expresses our
        a-priori belief about each topic's probability. Alternatively, default
        prior-selecting strategies can be employed by supplying a string:
        'asymmetric': uses a fixed normalized asymmetric prior of
        1.0 / topicno.
        'auto': learns an asymmetric prior from the corpus (not available if
        distributed==True).

        eta ({float, np.array, str}, optional) – A-priori belief on word
        probability. This can be: a scalar for a symmetric prior over
        topic/word probability, a vector of length num_words to denote an
        asymmetric user-defined probability for each word, a matrix of shape
        (num_topics, num_words) to assign a probability to each word-topic
        combination, or the string 'auto' to learn the asymmetric prior from
        the data.

        decay (float, optional) – A number between (0.5, 1] to weight what
        percentage of the previous lambda value is forgotten when each new
        document is examined.

        offset (float, optional) – Hyper-parameter that controls how much we
        slow down the first steps of the first few iterations.

        eval_every (int, optional) – Log perplexity is estimated every that
        many updates. Setting this to one slows down training by ~2x.

        iterations (int, optional) – Maximum number of iterations through the
        corpus when inferring the topic distribution of a corpus.

        gamma_threshold (float, optional) – Minimum change in the value of the
        gamma parameters to continue iterating.

        random_state ({np.random.RandomState, int}, optional) – Either a
        randomState object or a seed to generate one. Useful for
        reproducibility.
        """
        super().__init__()
        self.hyperparameters = dict()
        self.hyperparameters["num_topics"] = num_topics
        self.hyperparameters["distributed"] = distributed
        self.hyperparameters["chunksize"] = chunksize
        self.hyperparameters["passes"] = passes
        self.hyperparameters["update_every"] = update_every
        self.hyperparameters["alpha"] = alpha
        self.hyperparameters["eta"] = eta
        self.hyperparameters["decay"] = decay
        self.hyperparameters["offset"] = offset
        self.hyperparameters["eval_every"] = eval_every
        self.hyperparameters["iterations"] = iterations
        self.hyperparameters["gamma_threshold"] = gamma_threshold
        self.hyperparameters["random_state"] = random_state
    def info(self):
        """
        Returns model information
        """
        return {
            "citation": citations.models_LDA,
            "name": "LDA, Latent Dirichlet Allocation"
        }
    def hyperparameters_info(self):
        """
        Returns hyperparameter information
        """
        return defaults.LDA_hyperparameters_info
    def set_hyperparameters(self, **kwargs):
        """
        Set model hyperparameters
        """
        super().set_hyperparameters(**kwargs)
        # Allow alpha to be a float in case of symmetric alpha
        if "alpha" in kwargs:
            if isinstance(kwargs["alpha"], float):
                self.hyperparameters["alpha"] = [
                    kwargs["alpha"]
                ] * self.hyperparameters["num_topics"]
    def partitioning(self, use_partitions, update_with_test=False):
        """
        Handle the partitioning system to use and reset the model to perform
        new evaluations

        Parameters
        ----------
        use_partitions: True if train/test partitioning is needed, False
        otherwise
        update_with_test: True if the model should be updated with the test
        set, False otherwise
        """
        self.use_partitions = use_partitions
        self.update_with_test = update_with_test
        self.id2word = None
        self.id_corpus = None
    def train_model(self, dataset, hyperparams=None, top_words=10):
        """
        Train the model and return output

        Parameters
        ----------
        dataset : dataset to use to build the model
        hyperparams : hyperparameters to build the model
        top_words : if greater than 0 returns the most significant words for
        each topic in the output (Default 10)

        Returns
        -------
        result : dictionary with up to 3 entries,
                 'topics', 'topic-word-matrix' and 'topic-document-matrix'
        """
        if hyperparams is None:
            hyperparams = {}

        if self.use_partitions:
            train_corpus, test_corpus = dataset.get_partitioned_corpus(
                use_validation=False)
        else:
            train_corpus = dataset.get_corpus()

        if self.id2word is None:
            self.id2word = corpora.Dictionary(dataset.get_corpus())

        if self.id_corpus is None:
            self.id_corpus = [self.id2word.doc2bow(document)
                              for document in train_corpus]

        if "num_topics" not in hyperparams:
            hyperparams["num_topics"] = self.hyperparameters["num_topics"]

        # Allow alpha to be a float in case of symmetric alpha
        if "alpha" in hyperparams:
            if isinstance(hyperparams["alpha"], float):
                hyperparams["alpha"] = [
                    hyperparams["alpha"]
                ] * hyperparams["num_topics"]

        hyperparams["corpus"] = self.id_corpus
        hyperparams["id2word"] = self.id2word
        self.hyperparameters.update(hyperparams)

        self.trained_model = ldamodel.LdaModel(**self.hyperparameters)

        result = {}

        result["topic-word-matrix"] = self.trained_model.get_topics()

        if top_words > 0:
            topics_output = []
            for topic in result["topic-word-matrix"]:
                top_k = np.argsort(topic)[-top_words:]
                top_k_words = list(reversed([self.id2word[i] for i in top_k]))
                topics_output.append(top_k_words)
            result["topics"] = topics_output

        result["topic-document-matrix"] = self._get_topic_document_matrix()

        if self.use_partitions:
            new_corpus = [self.id2word.doc2bow(
                document) for document in test_corpus]
            if self.update_with_test:
                self.trained_model.update(new_corpus)
                self.id_corpus.extend(new_corpus)

                result["test-topic-word-matrix"] = (
                    self.trained_model.get_topics())

                if top_words > 0:
                    topics_output = []
                    for topic in result["test-topic-word-matrix"]:
                        top_k = np.argsort(topic)[-top_words:]
                        top_k_words = list(
                            reversed([self.id2word[i] for i in top_k]))
                        topics_output.append(top_k_words)
                    result["test-topics"] = topics_output

                result["test-topic-document-matrix"] = (
                    self._get_topic_document_matrix())

            else:
                test_document_topic_matrix = []
                for document in new_corpus:
                    document_topics_tuples = self.trained_model[document]
                    document_topics = np.zeros(
                        self.hyperparameters["num_topics"])
                    for single_tuple in document_topics_tuples:
                        document_topics[single_tuple[0]] = single_tuple[1]
                    test_document_topic_matrix.append(document_topics)
                result["test-topic-document-matrix"] = np.array(
                    test_document_topic_matrix).transpose()
        return result
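    # The matrices built above follow the layout used by the helpers below:
    # "topic-word-matrix" has shape (num_topics, vocabulary_size), as returned
    # by trained_model.get_topics(), and "topic-document-matrix" has shape
    # (num_topics, number_of_documents), as built by
    # _get_topic_document_matrix(); the test-set variants use the same layout.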
    def _get_topics_words(self, topk):
        """
        Return the most significant words for each topic.
        """
        topic_terms = []
        for i in range(self.hyperparameters["num_topics"]):
            topic_words_list = []
            for word_tuple in self.trained_model.get_topic_terms(i, topk):
                topic_words_list.append(self.id2word[word_tuple[0]])
            topic_terms.append(topic_words_list)
        return topic_terms

    def _get_topic_document_matrix(self):
        """
        Return the topic representation of the corpus
        """
        doc_topic_tuples = []
        for document in self.id_corpus:
            doc_topic_tuples.append(
                self.trained_model.get_document_topics(
                    document, minimum_probability=0))

        topic_document = np.zeros((
            self.hyperparameters["num_topics"], len(doc_topic_tuples)))

        for ndoc in range(len(doc_topic_tuples)):
            document = doc_topic_tuples[ndoc]
            for topic_tuple in document:
                topic_document[topic_tuple[0]][ndoc] = topic_tuple[1]
        return topic_document
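# A minimal usage sketch, assuming the octis.dataset.dataset.Dataset helper
# and its fetch_dataset method are available in the installed OCTIS package;
# swap in your own Dataset object if that assumption does not hold.
if __name__ == "__main__":
    from octis.dataset.dataset import Dataset

    dataset = Dataset()
    dataset.fetch_dataset("20NewsGroup")  # download a preprocessed corpus

    model = LDA(num_topics=20, alpha="symmetric")
    # model.partitioning(False)  # uncomment to train on the full corpus
    output = model.train_model(dataset, top_words=10)

    # Each entry of "topics" holds the top_words most significant words of
    # one topic, most probable first.
    for topic in output["topics"]:
        print(" ".join(topic))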