from octis.models.model import AbstractModel
import numpy as np
from gensim.models import ldamodel
import gensim.corpora as corpora
import octis.configuration.citations as citations
import octis.configuration.defaults as defaults


class LDA(AbstractModel):

    id2word = None
    id_corpus = None
    use_partitions = True
    update_with_test = False
    def __init__(
            self, num_topics=100, distributed=False, chunksize=2000,
            passes=1, update_every=1, alpha="symmetric", eta=None, decay=0.5,
            offset=1.0, eval_every=10, iterations=50, gamma_threshold=0.001,
            random_state=None):
"""
Initialize LDA model
Parameters
----------
num_topics (int, optional) – The number of requested latent topics to
be extracted from the training corpus.
distributed (bool, optional) – Whether distributed computing should be
used to accelerate training.
chunksize (int, optional) – Number of documents to be used in each
training chunk.
passes (int, optional) – Number of passes through the corpus during
training.
update_every (int, optional) – Number of documents to be iterated
through for each update. Set to 0 for batch learning, > 1 for
online iterative learning.
alpha ({numpy.ndarray, str}, optional) – Can be set to an 1D array of
length equal to the number of expected topics that expresses our
a-priori belief for the each topics’ probability. Alternatively
default prior selecting strategies can be employed by supplying
a string:
’asymmetric’: Uses a fixed normalized asymmetric prior of
1.0 / topicno.
’auto’: Learns an asymmetric prior from the corpus
(not available if distributed==True).
eta ({float, np.array, str}, optional) – A-priori belief on word
probability, this can be:
scalar for a symmetric prior over topic/word probability,
vector of length num_words to denote an asymmetric user defined
probability for each word,
matrix of shape (num_topics, num_words) to assign a probability
for each word-topic combination,
the string ‘auto’ to learn the asymmetric prior from the data.
decay (float, optional) – A number between (0.5, 1] to weight what
percentage of the previous lambda value is forgotten when each new
document is examined.
offset (float, optional) – Hyper-parameter that controls how much
we will slow down the first steps the first few iterations.
eval_every (int, optional) – Log perplexity is estimated every
that many updates. Setting this to one slows down training by ~2x.
iterations (int, optional) – Maximum number of iterations through the
corpus when inferring the topic distribution of a corpus.
gamma_threshold (float, optional) – Minimum change in the value of the
gamma parameters to continue iterating.
random_state ({np.random.RandomState, int}, optional) – Either a
randomState object or a seed to generate one.s
Useful for reproducibility.
"""
        super().__init__()
        self.hyperparameters = dict()
        self.hyperparameters["num_topics"] = num_topics
        self.hyperparameters["distributed"] = distributed
        self.hyperparameters["chunksize"] = chunksize
        self.hyperparameters["passes"] = passes
        self.hyperparameters["update_every"] = update_every
        self.hyperparameters["alpha"] = alpha
        self.hyperparameters["eta"] = eta
        self.hyperparameters["decay"] = decay
        self.hyperparameters["offset"] = offset
        self.hyperparameters["eval_every"] = eval_every
        self.hyperparameters["iterations"] = iterations
        self.hyperparameters["gamma_threshold"] = gamma_threshold
        self.hyperparameters["random_state"] = random_state

    def info(self):
        """
        Returns model information
        """
        return {
            "citation": citations.models_LDA,
            "name": "LDA, Latent Dirichlet Allocation"
        }

    def hyperparameters_info(self):
        """
        Returns hyperparameter information
        """
        return defaults.LDA_hyperparameters_info

    def set_hyperparameters(self, **kwargs):
        """
        Set model hyperparameters
        """
        super().set_hyperparameters(**kwargs)
        # Allow alpha to be a float in case of symmetric alpha
        if "alpha" in kwargs:
            if isinstance(kwargs["alpha"], float):
                self.hyperparameters["alpha"] = [
                    kwargs["alpha"]
                ] * self.hyperparameters["num_topics"]

    def partitioning(self, use_partitions, update_with_test=False):
        """
        Handle the partitioning system to use and reset the model to perform
        new evaluations

        Parameters
        ----------
        use_partitions: True if train/test partitioning is needed, False
            otherwise
        update_with_test: True if the model should be updated with the test
            set, False otherwise
        """
        self.use_partitions = use_partitions
        self.update_with_test = update_with_test
        self.id2word = None
        self.id_corpus = None
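        # Resetting id2word and id_corpus forces the next train_model call
        # to rebuild the dictionary and BoW corpus under the new scheme,
        # e.g. model.partitioning(False) switches to training on the full
        # corpus.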

    def train_model(self, dataset, hyperparams=None, top_words=10):
        """
        Train the model and return output

        Parameters
        ----------
        dataset : dataset to use to build the model
        hyperparams : hyperparameters to build the model
        top_words : if greater than 0, returns the most significant words for
            each topic in the output (Default 10)

        Returns
        -------
        result : dictionary with up to 3 entries, 'topics',
            'topic-word-matrix' and 'topic-document-matrix' (plus their
            'test-*' counterparts when partitioning is enabled)
        """
        if hyperparams is None:
            hyperparams = {}
        if self.use_partitions:
            train_corpus, test_corpus = dataset.get_partitioned_corpus(
                use_validation=False)
        else:
            train_corpus = dataset.get_corpus()

        if self.id2word is None:
            self.id2word = corpora.Dictionary(dataset.get_corpus())

        if self.id_corpus is None:
            self.id_corpus = [self.id2word.doc2bow(document)
                              for document in train_corpus]
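        # The dictionary and BoW corpus are cached across calls; use
        # partitioning() to reset them before evaluating a new configuration.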
if "num_topics" not in hyperparams:
hyperparams["num_topics"] = self.hyperparameters["num_topics"]
# Allow alpha to be a float in case of symmetric alpha
if "alpha" in hyperparams:
if isinstance(hyperparams["alpha"], float):
hyperparams["alpha"] = [
hyperparams["alpha"]
] * hyperparams["num_topics"]
hyperparams["corpus"] = self.id_corpus
hyperparams["id2word"] = self.id2word
self.hyperparameters.update(hyperparams)
self.trained_model = ldamodel.LdaModel(**self.hyperparameters)
result = {}
result["topic-word-matrix"] = self.trained_model.get_topics()
if top_words > 0:
topics_output = []
for topic in result["topic-word-matrix"]:
top_k = np.argsort(topic)[-top_words:]
top_k_words = list(reversed([self.id2word[i] for i in top_k]))
topics_output.append(top_k_words)
result["topics"] = topics_output
result["topic-document-matrix"] = self._get_topic_document_matrix()
if self.use_partitions:
new_corpus = [self.id2word.doc2bow(
document) for document in test_corpus]
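            # doc2bow with the training dictionary silently drops any
            # test-set word that never occurred in training.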
            if self.update_with_test:
                self.trained_model.update(new_corpus)
                self.id_corpus.extend(new_corpus)

                result["test-topic-word-matrix"] = (
                    self.trained_model.get_topics())

                if top_words > 0:
                    topics_output = []
                    for topic in result["test-topic-word-matrix"]:
                        top_k = np.argsort(topic)[-top_words:]
                        top_k_words = list(
                            reversed([self.id2word[i] for i in top_k]))
                        topics_output.append(top_k_words)
                    result["test-topics"] = topics_output

                result["test-topic-document-matrix"] = (
                    self._get_topic_document_matrix())
            else:
                test_document_topic_matrix = []
                for document in new_corpus:
                    document_topics_tuples = self.trained_model[document]
                    document_topics = np.zeros(
                        self.hyperparameters["num_topics"])
                    for single_tuple in document_topics_tuples:
                        document_topics[single_tuple[0]] = single_tuple[1]
                    test_document_topic_matrix.append(document_topics)
                result["test-topic-document-matrix"] = np.array(
                    test_document_topic_matrix).transpose()
        return result
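    # Shape sketch of the dictionary returned above (K = num_topics,
    # V = vocabulary size, D = number of training documents):
    #   result["topic-word-matrix"]     -> (K, V) ndarray
    #   result["topics"]                -> K lists of top_words strings
    #   result["topic-document-matrix"] -> (K, D) ndarray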

    def _get_topics_words(self, topk):
        """
        Return the most significant words for each topic.
        """
        topic_terms = []
        for i in range(self.hyperparameters["num_topics"]):
            topic_words_list = []
            for word_tuple in self.trained_model.get_topic_terms(i, topk):
                topic_words_list.append(self.id2word[word_tuple[0]])
            topic_terms.append(topic_words_list)
        return topic_terms

    def _get_topic_document_matrix(self):
        """
        Return the topic representation of the corpus
        """
        doc_topic_tuples = []
        for document in self.id_corpus:
            doc_topic_tuples.append(
                self.trained_model.get_document_topics(
                    document, minimum_probability=0))
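        # minimum_probability=0 keeps every (topic_id, probability) pair, so
        # the dense matrix built below gets a full column per document.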
        topic_document = np.zeros((
            self.hyperparameters["num_topics"],
            len(doc_topic_tuples)))
        for ndoc in range(len(doc_topic_tuples)):
            document = doc_topic_tuples[ndoc]
            for topic_tuple in document:
                topic_document[topic_tuple[0]][ndoc] = topic_tuple[1]
        return topic_document
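

if __name__ == "__main__":
    # Usage sketch: assumes OCTIS's Dataset loader and its bundled
    # "20NewsGroup" corpus; swap in your own preprocessed dataset as needed.
    from octis.dataset.dataset import Dataset

    dataset = Dataset()
    dataset.fetch_dataset("20NewsGroup")

    model = LDA(num_topics=20, random_state=42)
    output = model.train_model(dataset, top_words=10)
    print(output["topics"][0])  # ten most probable words of topic 0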