Source code for octis.models.model

from abc import ABC, abstractmethod
import os
import numpy as np
import json


class AbstractModel(ABC):
    """
    Common skeleton shared by every Topic Modeling implementation.
    """

    def __init__(self):
        """
        Build an empty model whose hyperparameter dictionary starts blank.
        """
        self.hyperparameters = {}

    def set_hyperparameters(self, **kwargs):
        """
        Record hyperparameters on the model, overwriting existing names.

        :param **kwargs: keyword arguments in the form
            {hyperparameter name: value}
        """
        self.hyperparameters.update(kwargs)

    @abstractmethod
    def train_model(self, dataset, hyperparameters, top_words=10):
        """
        Train the model. Concrete subclasses must override this method.

        :param dataset: Dataset
        :param hyperparameters: dictionary in the form
            {hyperparameter name: value}
        :param top_words: number of top significant words for each topic
            (default: 10)

        :return model_output: a dictionary containing up to 4 keys:

            - *topics*: the most significant words of each topic
              (list of lists of strings).
            - *topic-word-matrix*: matrix (num topics x ||vocabulary||)
              holding the probability of each word in a given topic.
            - *topic-document-matrix*: matrix
              (||topics|| x ||training documents||) holding the probability
              of each topic in a given training document.
            - *test-topic-document-matrix*: matrix
              (||topics|| x ||testing documents||) holding the probability
              of each topic in a given testing document.
        """
def save_model_output(model_output, path=os.curdir, appr_order=7):
    """
    Saves the model output in the chosen location as a compressed archive.

    Every entry except "topics" and "test-topics" is rounded through its own
    ``round`` method before being written; those two entries are stored
    unchanged.

    :param model_output: output of the model, a dictionary in the form
        {output name: value}
    :param path: path in which the file will be saved and name of the file;
        it is handed to ``numpy.savez_compressed``, which appends ``.npz``
        when the name does not already end with it
    :param appr_order: approximation order (number of decimals used to round
        model_output values)
    :raises Exception: if an entry cannot be rounded or the archive cannot
        be written; the original error is attached as ``__cause__``
    """
    to_save = {}
    try:
        for name, value in model_output.items():
            if name in ("topics", "test-topics"):
                # Word lists are not numeric, so they are stored as they are.
                to_save[name] = value
            else:
                to_save[name] = value.round(appr_order)
        np.savez_compressed(path, **to_save)
    except Exception as err:
        # Catch Exception only, so KeyboardInterrupt/SystemExit propagate,
        # and chain the cause so the underlying failure stays visible.
        raise Exception("error in saving the output model file") from err
def load_model_output(output_path, vocabulary_path=None, top_words=10):
    """
    Loads a model output from the chosen location.

    Without a vocabulary the archive content is returned as stored (a
    dictionary of numpy arrays). With a vocabulary, the matrices present in
    the output are converted to nested lists and a "topics" entry is added
    holding, for each topic, its top words as ``[word, weight]`` pairs
    ordered from the highest to the lowest weight.

    :param output_path: path in which the model output is saved
    :param vocabulary_path: path in which the vocabulary is saved (optional,
        used to retrieve the top k words of each topic); a JSON object
        mapping the word index (as a string) to the word
    :param top_words: top k words to retrieve for each topic (in case a
        vocabulary path is given); 0 yields an empty list for every topic
    :return: dictionary with the loaded model output
    :raises KeyError: if a vocabulary is given and the output has no
        "topic-word-matrix" entry, or the vocabulary lacks a needed index
    """
    # NOTE(security): allow_pickle=True lets numpy unpickle object arrays,
    # which can execute arbitrary code. Only load archives from trusted
    # sources.
    with np.load(output_path, allow_pickle=True) as archive:
        # dict() reads every array eagerly, so the archive can be closed.
        output = dict(archive)

    if vocabulary_path is None:
        return output

    with open(vocabulary_path, 'r') as vocabulary_file:
        index2vocab = json.load(vocabulary_file)

    topics_output = []
    for topic in output["topic-word-matrix"]:
        # Reverse first, then slice: same result as taking the last
        # `top_words` indices and reversing them, but top_words=0 now gives
        # no words instead of the whole vocabulary ([-0:] is a full slice).
        top_k = np.argsort(topic)[::-1][:top_words]
        topics_output.append(
            [[index2vocab[str(i)], float(topic[i])] for i in top_k])

    output["topic-word-matrix"] = output["topic-word-matrix"].tolist()
    # The remaining matrices are optional: convert only the ones present.
    optional_matrices = (
        "topic-document-matrix",
        "test-topic-word-matrix",
        "test-topic-document-matrix",
    )
    for name in optional_matrices:
        if name in output:
            output[name] = output[name].tolist()

    output["topics"] = topics_output
    return output