Thursday, May 25, 2017

Open Source: Probabilistic Latent Semantic Analysis (pLSA) implementation in Java

java-plsa

This package provides a Java implementation of probabilistic latent semantic analysis (pLSA).
Build Status Coverage Status

Install

Add the following dependency to your POM file:
<!-- Maven coordinates (groupId / artifactId / version) of the java-plsa library -->
<dependency>
  <groupId>com.github.chen0040</groupId>
  <artifactId>java-plsa</artifactId>
  <version>1.0.1</version>
</dependency>

Usage

The sample code below illustrates how to perform topic modelling using pLSA:
// Input corpus: one string per document. The bracketed strings and the "..." are
// placeholders to be replaced with real document text.
List<String> docs = Arrays.asList("[doc-1-content]", "[doc-2-content]", ...);

// Create the pLSA model and configure it before fitting.
pLSA method = new pLSA();
method.setStemmerEnabled(true); // presumably stems words during preprocessing - confirm in library docs

method.setMaxIters(10); // presumably an upper bound on training iterations - confirm in library docs
method.setMaxVocabularySize(1000); // presumably caps the number of distinct words kept - confirm in library docs
method.fit(docs); // fit the model on the documents; the queries below read from the fitted model

// For each topic known to the fitted model, print its top-ranked documents and words.
// The second argument (3) is presumably the maximum number of entries returned.
for (int topicIdx = 0; topicIdx < method.getTopicCount(); topicIdx++) {
    final List<TupleTwo<Document, Double>> bestDocs = method.getTopRankingDocs4Topic(topicIdx, 3);
    final List<TupleTwo<String, Double>> bestWords = method.getTopRankingWords4Topic(topicIdx, 3);

    System.out.println("Topic " + topicIdx + ": ");

    // Each document is printed as "<docIndex>(<score>), " followed by its content on the same line.
    System.out.println("Top Ranked Document:");
    for (TupleTwo<Document, Double> rankedDoc : bestDocs) {
        final Document document = rankedDoc._1();
        final double docScore = rankedDoc._2();
        System.out.print(document.docIndex() + "(" + docScore + "), ");
        System.out.println(document.content());
    }
    System.out.println();

    // Words are printed on a single line as "<word>(<score>), " pairs.
    System.out.println("Top Ranked Words:");
    for (TupleTwo<String, Double> rankedWord : bestWords) {
        final String term = rankedWord._1();
        final double termScore = rankedWord._2();
        System.out.print(term + "(" + termScore + "), ");
    }
    System.out.println();
}

// Visual separator between the per-topic report and the per-document report.
System.out.println("// ============================================= //");

// For each document, print its top-ranked topics on one line as "<topic>(<score>), " pairs.
// The second argument (3) is presumably the maximum number of entries returned.
for (int docIdx = 0; docIdx < method.getDocCount(); docIdx++) {
    final List<TupleTwo<Integer, Double>> bestTopics = method.getTopRankingTopics4Doc(docIdx, 3);
    System.out.print("Doc " + docIdx + ": ");
    for (TupleTwo<Integer, Double> rankedTopic : bestTopics) {
        final int topicId = rankedTopic._1();
        final double topicScore = rankedTopic._2();
        System.out.print(topicId + "(" + topicScore + "), ");
    }
    System.out.println();
}

No comments:

Post a Comment