I would like to build an app in which I periodically build an LDA model offline with Amazon EMR/Hadoop, and then make document/topic inferences for new documents online.
I read a post/a reply about using LDA CVB model to match a new doc to topics (http://tinyurl.com/mef4czr) I have some questions about using LDA CVB TopicModel class which isn't well documented: Q: how many iterations are good? and why? Q: do we get model mutated by training w/ new doc? why? Q: what is inferred in this program? how to use that infer() method? and Q: Has anyone seen good example usages/sample code of doing this kind of task? Thanks, Henry Lee. See my code below: @Testpublic void testOfJakeMannixIdeaAndQuestions() { // [email protected] val conf = new Configuration(); val dictionary = readDictionary(new Path("/tmp/dictionary.file-0"), conf); assertThat(dictionary.length, equalTo(41807)); // tfidf_vector represents a document in RandomAccessSparseVector. val tfidf_vector = readTFVectorsInRange(new Path("/tmp/tfidf-vectors"), conf, 0, 1)[0].getSecond(); assertThat(tfidf_vector.size(), equalTo(41807)); // reads 'model' dense matrix (20 x 41K), and in 'topicSum' dense vector. TopicModel model = readModel(dictionary, new Path("/tmp/reuters-lda-model-splits"), conf); assertThat(model.getNumTopics(), equalTo(20)); assertThat(model.getNumTerms(), equalTo(41807)); val doc = tfidf_vector; Vector docTopics = new DenseVector(new double[model.getNumTopics()]).assign(1.0/model.getNumTopics()); Matrix docTopicModel = new SparseRowMatrix(model.getNumTopics(), doc.size()); // Q: How many iterations are good? Why? for (int i = 0; i < 100 /* maxItrs */; i++) { model.trainDocTopicModel(doc, docTopics, docTopicModel); System.out.println(docTopics.toString()); // Q: Do you think that 'model' got mutated, or not? why? } Vector inferred = model.infer(doc, docTopics); System.out.println(inferred); // Q: What is this inferred? 
How can I use it?} @SneakyThrows({ IOException.class })private static Pair<String, Vector>[] readTFVectorsInRange(Path path, Configuration conf, int offset, int length) { val seq = new SequenceFile.Reader(FileSystem.get(conf), path, conf); val documentName = new Text(); @SuppressWarnings("unchecked") Pair<String, Vector>[] vectors = new Pair[length]; VectorWritable vector = new VectorWritable(); for (int i = 0; i < offset + length && seq.next(documentName, vector); i++) { if (i >= offset) { vectors[i - offset] = Pair.of(documentName.toString(), vector.get()); } } return vectors;} @SneakyThrows({ IOException.class })private static TopicModel readModel(String[] dictionary, Path path, Configuration conf) { double alpha = 0.0001; // default: doc-topic smoothing double eta = 0.0001; // default: term-topic smoothing double modelWeight = 1f; return new TopicModel(conf, eta, alpha, dictionary, 1, modelWeight, listModelPath(path, conf));} @SneakyThrows({ IOException.class })private static Path[] listModelPath(Path path, Configuration conf) { if (FileSystem.get(conf).isFile(path)) { return new Path[] { path }; } else { val statuses = FileSystem.get(conf).listStatus(path, PathFilters.partFilter()); val modelPaths = new Path[statuses.length]; for (int i = 0; i < statuses.length; i++) { modelPaths[i] = new Path(statuses[i].getPath().toUri().toString()); } return modelPaths; }} @SneakyThrows({ IOException.class })private static String[] readDictionary(Path path, Configuration conf) { val term = new Text(); val id = new IntWritable(); val reader = new SequenceFile.Reader(FileSystem.get(conf), path, conf); val termIds = ImmutableList.<Pair<String, Integer>>builder(); int maxId = 0; while (reader.next(term, id)) { termIds.add(Pair.of(term.toString(), id.get())); maxId = max(maxId, id.get()); } String[] terms = new String[maxId + 1]; for (val termId : termIds.build()) { terms[termId.getSecond().intValue()] = termId.getFirst().toString(); } return terms;}
