Created
September 20, 2010 10:36
-
-
Save maplpro/587718 to your computer and use it in GitHub Desktop.
Indexing with Lucene.Net
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import clr | |
clr.AddReference( "Lucene.Net.dll" ) | |
from System.IO import FileInfo | |
# Location of the Lucene index, relative to the working directory; used
# both when writing the index and when reopening it for reading.
idx_dir = FileInfo("index")
from Lucene.Net.Index import IndexWriter, IndexReader | |
from Lucene.Net.Analysis.Standard import StandardAnalyzer | |
from Lucene.Net.Analysis import StopFilter | |
import System | |
# Words handed to StandardAnalyzer as its stop-word set.
stop_words = [
    'bit.ly', 'from', 'maplpro', 'and', 'at', 'are', 'a', 'all',
    'but', 'be', 'create', 'how', 'is', 'in', 'it', 'get',
    'for', 'http', 'an', 'no', 'can', 'data', 'if', 'new',
    'of', 'on', 'only', 'rt', 'that', 'the', 'to', 'services',
    'service', 'server', 'with', 'you', 'your', 'id', 'file', 'windows',
    'web', 'just', 'by', 'any', 'api', 'has', 'software', 'now',
]

# The analyzer needs a .NET string array, so the Python list is converted
# through System.Array[str].
analyzer = StandardAnalyzer(System.Array[str](stop_words))

# Third argument is IndexWriter's "create" flag: True starts a fresh index
# in idx_dir instead of appending to an existing one.
index = IndexWriter(idx_dir, analyzer, True)
def document(f):
    """Return a Lucene Document with one "contents" field read from *f*.

    *f* must expose a ``FullName`` attribute giving the path to open (the
    caller in this script passes a ``System.IO.FileInfo``). The file is
    opened through a ``StreamReader`` using ``Encoding.Default`` and the
    field is created with ``Field.TermVector.YES``.

    NOTE: the StreamReader is not closed here. Lucene consumes
    reader-valued fields when the document is added to an index, so the
    reader has to stay open until then.
    """
    from Lucene.Net.Documents import Document, Field
    from System.IO import StreamReader
    from System.Text import Encoding

    doc = Document()
    text_reader = StreamReader(f.FullName, Encoding.Default)
    contents_field = Field("contents", text_reader, Field.TermVector.YES)
    doc.Add(contents_field)
    return doc
# Index the single input file, then reopen the index and print every
# (term, frequency) pair whose frequency is greater than 3.
try:
    index.AddDocument(document(FileInfo("activities.txt")))
    index.Optimize()
finally:
    # Close the writer even if indexing fails, so the index directory is
    # not left holding the writer's lock.
    index.Close()

reader = IndexReader.Open(idx_dir)
try:
    # Document 0 is the only document added above; its first term vector
    # is the one for the "contents" field (the only field with vectors).
    vectors = reader.GetTermFreqVectors(0)
    terms = vectors[0].GetTerms()
    frequencies = vectors[0].GetTermFrequencies()
    # Terms and frequencies are parallel arrays, so pair them positionally.
    # Single-argument print(...) prints the same under IronPython 2.
    print([(term, count) for term, count in zip(terms, frequencies) if count > 3])
finally:
    # The original never released the reader; close it explicitly.
    reader.Close()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment