# Source code for octis.models.model

from abc import ABC, abstractmethod
import os
import numpy as np
import json


class AbstractModel(ABC):
    """
    Class structure of a generic Topic Modeling implementation.

    Subclasses must implement :meth:`train_model`. Hyperparameters are kept
    in the ``hyperparameters`` dictionary of the instance.
    """

    def __init__(self):
        """
        Create a blank model with an empty hyperparameter dictionary.
        """
        self.hyperparameters = dict()

    def set_hyperparameters(self, **kwargs):
        """
        Set model hyperparameters.

        Existing entries with the same name are overwritten; entries that are
        not mentioned are left untouched.

        :param **kwargs: keyword arguments in the form
            ``hyperparameter_name=value``
        """
        self.hyperparameters.update(kwargs)

    @abstractmethod
    def train_model(self, dataset, hyperparameters, top_words=10):
        """
        Train the model.

        :param dataset: Dataset
        :param hyperparameters: dictionary in the form
            {hyperparameter name: value}
        :param top_words: number of top significant words for each topic
            (default: 10)

        :return model_output: a dictionary containing up to 4 keys:
            *topics*, *topic-word-matrix*, *topic-document-matrix*,
            *test-topic-document-matrix*.

            - *topics* is the list of the most significant words for each
              topic (list of lists of strings).
            - *topic-word-matrix* is the matrix
              (num topics x ||vocabulary||) containing the probabilities of
              a word in a given topic.
            - *topic-document-matrix* is the matrix
              (||topics|| x ||training documents||) containing the
              probabilities of the topics in a given training document.
            - *test-topic-document-matrix* is the matrix
              (||topics|| x ||testing documents||) containing the
              probabilities of the topics in a given testing document.
        """
        pass


def save_model_output(model_output, path=os.curdir, appr_order=7):
    """
    Save the model output as a compressed NumPy archive.

    Every entry except "topics" and "test-topics" is rounded by calling its
    ``round`` method with ``appr_order``; those two entries are stored
    unchanged. The resulting mapping is written with
    ``numpy.savez_compressed``.

    :param model_output: output of the model (dictionary mapping output
        names to values)
    :param path: path in which the file will be saved and name of the file
    :param appr_order: approximation order (used to round model_output
        values)
    :raises Exception: if rounding or writing fails; the original error is
        attached as ``__cause__``
    """
    unrounded_keys = ("topics", "test-topics")
    try:
        to_save = {
            name: value if name in unrounded_keys else value.round(appr_order)
            for name, value in model_output.items()
        }
        np.savez_compressed(path, **to_save)
    except Exception as err:
        # Same exception type and message as before, but chained so the
        # underlying failure stays visible; KeyboardInterrupt/SystemExit are
        # no longer intercepted.
        raise Exception("error in saving the output model file") from err


def load_model_output(output_path, vocabulary_path=None, top_words=10):
    """
    Load a model output from the chosen path.

    Without a vocabulary the archive content is returned as a dictionary of
    NumPy arrays. With a vocabulary, the matrices are converted to nested
    lists and a "topics" entry is built from "topic-word-matrix": for each
    topic, a list of ``[word, weight]`` pairs ordered from the highest
    weight to the lowest.

    :param output_path: path in which the model output is saved
    :param vocabulary_path: path in which the vocabulary is saved (optional,
        used to retrieve the top k words of each topic). The file must be a
        JSON object whose keys are word indices written as strings.
    :param top_words: top k words to retrieve for each topic (in case a
        vocabulary path is given); 0 yields an empty list per topic
    :return: dictionary with the loaded model output
    """
    # NOTE(security): allow_pickle=True lets the archive run arbitrary code
    # while being unpickled; only load files from trusted sources.
    with np.load(output_path, allow_pickle=True) as archive:
        # dict() reads every array eagerly, so the archive can be closed.
        output = dict(archive)

    if vocabulary_path is None:
        return output

    with open(vocabulary_path, 'r') as vocabulary_file:
        index2vocab = json.load(vocabulary_file)

    topics_output = []
    for topic in output["topic-word-matrix"]:
        # Descending order first, then truncate: unlike a negative slice
        # on the ascending order, this handles top_words == 0 correctly.
        top_k = np.argsort(topic)[::-1][:top_words]
        topics_output.append(
            [[index2vocab[str(i)], float(topic[i])] for i in top_k])

    output["topic-word-matrix"] = output["topic-word-matrix"].tolist()
    output["topic-document-matrix"] = output[
        "topic-document-matrix"].tolist()
    # The test matrices are optional in a saved model output.
    for optional_key in ("test-topic-word-matrix",
                         "test-topic-document-matrix"):
        if optional_key in output:
            output[optional_key] = output[optional_key].tolist()

    output["topics"] = topics_output
    return output