.. _sphx_glr_auto_examples_tensor_lda_with_20_news_group.py:

================================
Topic extraction with Tensor LDA
================================

This example is adapted from scikit-learn's "Topic extraction with
Non-negative Matrix Factorization and Latent Dirichlet Allocation" example.
It applies :class:`tensor_lda.tensor_lda.TensorLDA` to the 20 newsgroups
dataset; the output is a list of topics, each represented as a list of its
top terms (term weights are not shown).


.. code-block:: python


    from __future__ import print_function
    from time import time

    from sklearn.feature_extraction.text import CountVectorizer
    from sklearn.datasets import fetch_20newsgroups

    from tensor_lda.tensor_lda import TensorLDA

    n_samples = 10000
    n_features = 1000
    n_components = 40
    n_top_words = 10


    def print_top_words(model, feature_names, n_top_words):
        # Print each topic's Dirichlet prior and its highest-weighted terms.
        for topic_idx, topic in enumerate(model.components_):
            topic_prior = model.alpha_[topic_idx]
            message = "Topic #%d (prior: %.3f): " % (topic_idx, topic_prior)
            message += " ".join([feature_names[i]
                                 for i in topic.argsort()[:-n_top_words - 1:-1]])
            print(message)
        print()


    # Load the 20 newsgroups dataset and vectorize it. We use a few heuristics
    # to filter out useless terms early on: the posts are stripped of headers,
    # footers and quoted replies, and common English words, words occurring in
    # fewer than 5 documents or in more than 80% of the documents are removed.
    print("Loading dataset...")
    t0 = time()
    dataset = fetch_20newsgroups(shuffle=True, random_state=2,
                                 remove=('headers', 'footers', 'quotes'))
    data_samples = dataset.data[:n_samples]
    print("done in %0.3fs." % (time() - t0))

    # Use tf (raw term count) features for LDA.
    print("Extracting tf features for LDA...")
    tf_vectorizer = CountVectorizer(max_df=0.8, min_df=5,
                                    max_features=n_features,
                                    stop_words='english')
    t0 = time()
    tf = tf_vectorizer.fit_transform(data_samples)
    print("done in %0.3fs." % (time() - t0))
    print()

    print("Fitting the TensorLDA model with tf features, "
          "n_samples=%d and n_features=%d..."
          % (n_samples, n_features))
    lda = TensorLDA(n_components=n_components, alpha0=.1)
    t0 = time()
    lda.fit(tf)
    print("done in %0.3fs." % (time() - t0))

    print("\nTopics in LDA model:")
    tf_feature_names = tf_vectorizer.get_feature_names()
    print_top_words(lda, tf_feature_names, n_top_words)

    # Infer topic distributions for the first two documents and show the
    # first document's topic mixture next to its raw text.
    doc_topics = lda.transform(tf[0:2, :])
    print(doc_topics[0, :])
    print(data_samples[0])

**Total running time of the script:** ( 0 minutes  0.000 seconds)


.. only:: html

 .. container:: sphx-glr-footer


  .. container:: sphx-glr-download

     :download:`Download Python source code: tensor_lda_with_20_news_group.py <tensor_lda_with_20_news_group.py>`



  .. container:: sphx-glr-download

     :download:`Download Jupyter notebook: tensor_lda_with_20_news_group.ipynb <tensor_lda_with_20_news_group.ipynb>`

.. only:: html

 .. rst-class:: sphx-glr-signature

    `Gallery generated by Sphinx-Gallery <https://sphinx-gallery.readthedocs.io>`_