Skip to content

Instantly share code, notes, and snippets.

@maplpro
Created September 20, 2010 10:36
Show Gist options
  • Save maplpro/587718 to your computer and use it in GitHub Desktop.
Save maplpro/587718 to your computer and use it in GitHub Desktop.
Indexing with Lucene.Net
# clr is the bridge to .NET assemblies; the Lucene.Net assembly has to be
# referenced before any Lucene.Net namespace can be imported.
import clr
clr.AddReference( "Lucene.Net.dll" )
from System.IO import FileInfo
# Location of the index; handed to both IndexWriter and IndexReader.Open
# further down in this script.
idx_dir = FileInfo( "index" )
from Lucene.Net.Index import IndexWriter, IndexReader
from Lucene.Net.Analysis.Standard import StandardAnalyzer
# NOTE(review): StopFilter is not referenced in the visible part of this script.
from Lucene.Net.Analysis import StopFilter
# System is needed for System.Array when the stop-word list is converted.
import System
# Words the analyzer drops before indexing, so they never appear in the
# term-frequency report printed at the end of the script.
stop_words = [
    'bit.ly', 'from', 'maplpro', 'and', 'at', 'are', 'a', 'all', 'but', 'be',
    'create', 'how', 'is', 'in', 'it', 'get', 'for', 'http', 'an', 'no', 'can',
    'data', 'if', 'new', 'of', 'on', 'only', 'rt', 'that', 'the', 'to', 'services',
    'service', 'server', 'with', 'you', 'your', 'id', 'file', 'windows', 'web', 'just',
    'by', 'any', 'api', 'has', 'software', 'now',
]

# StandardAnalyzer receives its stop words as a .NET string array, so the
# Python list is converted first.
analyzer = StandardAnalyzer( System.Array[str]( stop_words ) )

# Writer over idx_dir using the analyzer above; the trailing True is the
# writer's "create" flag (see the Lucene.Net IndexWriter constructor docs).
index = IndexWriter( idx_dir, analyzer, True )
def document(f):
    """Build a Lucene document from the file described by *f*.

    *f* must expose a ``FullName`` attribute holding the file path (the
    script passes a ``System.IO.FileInfo``). The file is opened through a
    ``StreamReader`` with ``Encoding.Default`` and attached as a single
    "contents" field created with ``Field.TermVector.YES``.

    Returns the populated ``Document``.
    """
    from Lucene.Net.Documents import Document, Field
    from System.IO import StreamReader
    from System.Text import Encoding

    doc = Document()
    # The reader is handed to the field unread and unclosed; nothing in this
    # function consumes it.
    text_reader = StreamReader(f.FullName, Encoding.Default)
    contents_field = Field("contents", text_reader, Field.TermVector.YES)
    doc.Add(contents_field)
    return doc
# Write phase: index activities.txt as the only document. The writer is
# closed in a finally clause so it is still closed if adding or optimizing
# raises.
try:
    index.AddDocument( document( FileInfo( "activities.txt" ) ) )
    index.Optimize()
finally:
    index.Close()

# Read phase: reopen the index and report the frequent terms of document 0.
reader = IndexReader.Open( idx_dir )
try:
    vectors = reader.GetTermFreqVectors(0)
    # Only the first term vector of document 0 is inspected; the script adds
    # a single term-vector field ("contents") to that document.
    contents_vector = vectors[0]
    terms = contents_vector.GetTerms()
    frequencies = contents_vector.GetTermFrequencies()
    # Terms and frequencies are paired by position; keep only the terms that
    # occur more than 3 times. A single parenthesised argument prints the
    # same under the Python 2 print statement.
    print([(term, freq) for term, freq in zip( terms, frequencies ) if freq > 3])
finally:
    # The original never closed the reader.
    reader.Close()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment