# Source code for octis.models.NMF_scikit

from octis.models.model import AbstractModel
import numpy as np
from sklearn.decomposition import NMF
from sklearn.feature_extraction.text import TfidfVectorizer
import octis.configuration.defaults as defaults


class NMF_scikit(AbstractModel):
    """
    Topic model that factorizes a TF-IDF document-term matrix with
    scikit-learn's NMF.

    The vectorized corpus and the index-to-word mapping are cached on the
    instance after the first call to ``train_model`` and are rebuilt only
    after ``partitioning`` resets them.
    """

    def __init__(
        self, num_topics=100, init=None, alpha=0, l1_ratio=0,
            regularization='both', use_partitions=True):
        """
        Initialize NMF model

        Parameters
        ----------
        num_topics : int
            Number of topics to extract.

        init : string, optional
            Method used to initialize the procedure. Default: None.
            Valid options:

            None: 'nndsvd' if n_components <= min(n_samples, n_features),
            otherwise random.

            'random': non-negative random matrices, scaled with:
            sqrt(X.mean() / n_components)

            'nndsvd': Nonnegative Double Singular Value Decomposition
            (NNDSVD) initialization (better for sparseness)

            'nndsvda': NNDSVD with zeros filled with the average of X
            (better when sparsity is not desired)

            'nndsvdar': NNDSVD with zeros filled with small random values
            (generally faster, less accurate alternative to NNDSVDa for
            when sparsity is not desired)

        alpha : double, optional
            Constant that multiplies the regularization terms. Set it to
            zero to have no regularization.

        l1_ratio : double, optional
            The regularization mixing parameter, with 0 <= l1_ratio <= 1.
            For l1_ratio = 0 the penalty is an elementwise L2 penalty
            (aka Frobenius Norm). For l1_ratio = 1 it is an elementwise
            L1 penalty. For 0 < l1_ratio < 1, the penalty is a combination
            of L1 and L2.

        regularization : string, optional
            Forwarded unchanged to ``NMF(regularization=...)``.
            Default: 'both'.

        use_partitions : bool, optional
            True to train on the training partition of the dataset and
            evaluate on the test partition, False to train on the whole
            corpus. Default: True.
        """
        super().__init__()
        self.hyperparameters["num_topics"] = num_topics
        self.hyperparameters["init"] = init
        self.hyperparameters["alpha"] = alpha
        self.hyperparameters["l1_ratio"] = l1_ratio
        self.hyperparameters['regularization'] = regularization
        self.use_partitions = use_partitions

        # Cached by train_model: index -> word mapping, vectorized training
        # corpus and (only when partitions are used) vectorized test corpus.
        self.id2word = None
        self.id_corpus = None
        self.new_corpus = None
        self.update_with_test = False

    def hyperparameters_info(self):
        """
        Returns information about the hyperparameters
        """
        return defaults.NMF_scikit_hyperparameters_info

    def partitioning(self, use_partitions, update_with_test=False):
        """
        Handle the partitioning system to use and reset the model to perform
        new evaluations

        Parameters
        ----------
        use_partitions: True if train/test partitioning is needed, False
                        otherwise
        update_with_test: True if the model should be updated with the test
                          set, False otherwise
        """
        self.use_partitions = use_partitions
        self.update_with_test = update_with_test
        # Drop the cached data so the next train_model call re-vectorizes
        # the corpus according to the new partitioning choice.
        self.id2word = None
        self.id_corpus = None
        self.new_corpus = None

    def train_model(self, dataset, hyperparameters=None, top_words=10):
        """
        Train the model and return output

        Parameters
        ----------
        dataset : dataset to use to build the model
        hyperparameters : dictionary of hyperparameters that override the
                          ones stored on the model
        top_words : if greater than 0, the number of most significant
                    words returned for each topic in the output.
                    Default 10


        Returns
        -------
        result : dictionary with the entries
                 'topic-word-matrix' (the fitted NMF components),
                 'topics' (only if top_words > 0),
                 'topic-document-matrix' (transposed output of
                 fit_transform) and, when partitions are used,
                 'test-topic-document-matrix'
        """
        if hyperparameters is None:
            hyperparameters = {}

        # Vectorize only when the cache is empty. The test corpus is part of
        # the cache whenever partitions are in use.
        cache_missing = (
            self.id2word is None or self.id_corpus is None
            or (self.use_partitions and self.new_corpus is None))
        if cache_missing:
            vectorizer = TfidfVectorizer(
                min_df=0.0, token_pattern=r"(?u)\b[\w|\-]+\b",
                vocabulary=dataset.get_vocabulary())

            if self.use_partitions:
                partition = dataset.get_partitioned_corpus(
                    use_validation=False)
                train_docs, test_docs = partition[0], partition[1]
            else:
                train_docs, test_docs = dataset.get_corpus(), None

            # Documents are token lists; the vectorizer expects strings.
            self.id_corpus = vectorizer.fit_transform(
                [" ".join(document) for document in train_docs])

            # get_feature_names() was removed in scikit-learn 1.2; prefer
            # its replacement and fall back on older releases.
            if hasattr(vectorizer, "get_feature_names_out"):
                feature_names = vectorizer.get_feature_names_out()
            else:
                feature_names = vectorizer.get_feature_names()
            self.id2word = dict(enumerate(feature_names))

            if test_docs is not None:
                self.new_corpus = vectorizer.transform(
                    [" ".join(document) for document in test_docs])

        self.hyperparameters.update(hyperparameters)
        # NOTE(review): `alpha` and `regularization` were deprecated in
        # scikit-learn 1.0 in favour of `alpha_W`/`alpha_H` - confirm the
        # pinned scikit-learn version still accepts them.
        model = NMF(
            n_components=self.hyperparameters["num_topics"],
            init=self.hyperparameters["init"],
            alpha=self.hyperparameters["alpha"],
            l1_ratio=self.hyperparameters["l1_ratio"],
            regularization=self.hyperparameters['regularization'])

        # Both factors are returned as produced by NMF (not normalized).
        W = model.fit_transform(self.id_corpus)
        H = model.components_

        result = {"topic-word-matrix": H}

        if top_words > 0:
            result["topics"] = self.get_topics(H, top_words)

        result["topic-document-matrix"] = np.array(W).transpose()

        if self.use_partitions:
            if self.update_with_test:
                # NOT IMPLEMENTED YET: placeholder outputs that only reuse
                # the factors fitted on the training partition.
                # NOTE(review): W and H look swapped with respect to the
                # key names, and get_topics is fed W - confirm the intended
                # behaviour before relying on these entries.
                result["test-topic-word-matrix"] = W

                if top_words > 0:
                    result["test-topics"] = self.get_topics(W, top_words)

                result["test-topic-document-matrix"] = H

            else:
                result["test-topic-document-matrix"] = model.transform(
                    self.new_corpus).T

        return result

    def get_topics(self, H, top_words):
        """
        Return the most significant words of each topic

        Parameters
        ----------
        H : iterable of rows, one per topic, holding one weight per
            vocabulary index
        top_words : number of words to keep for each topic

        Returns
        -------
        topic_list : list with one entry per row of H; each entry is the
                     list of the top_words highest-weighted words, ordered
                     from the most to the least significant
        """
        topic_list = []
        for topic in H:
            # Highest weight first; sorted() is stable, so equally weighted
            # words keep their vocabulary order.
            ranked = sorted(
                enumerate(topic), key=lambda pair: pair[1], reverse=True)
            topic_list.append(
                [self.id2word[index] for index, _ in ranked[:top_words]])
        return topic_list