Last active
August 8, 2016 11:27
-
-
Save semihamakinist/a4bedb1d2ff37bf8f92d910a72371e8d to your computer and use it in GitHub Desktop.
solrSimServer.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*- | |
""" | |
Created on Tue Jul 19 12:04:03 2016 | |
@author: Semiha | |
""" | |
""" | |
how to run with indexing data on Solr. This code, it is answer. | |
""" | |
from simserver import SessionServer | |
from gensim import utils | |
import os, io, sys | |
from nltk.corpus import stopwords | |
import solr | |
reload(sys) | |
sys.setdefaultencoding('utf-8') #cp1254 | |
import logging | |
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO) | |
server = SessionServer('sessionSolrServer') # resume server (or create a new one) | |
def solrReadFile(start=0): | |
corpusList=[] | |
url="http://127.0.0.1:8983/solr/core1"#localhost/ip:port | |
conn=solr.SolrConnection(url=url) | |
response=conn.query('*', rows =1)#for finding the total number of data on the solr | |
searched=conn.query('*', rows =response.numFound) | |
i=start | |
for result in searched.results: | |
try: | |
categories=result["categories"][0].lower()#solr file key field | |
text_tmp=result["features"][0].lower().split("\n") #solr file key field for text | |
sentences=[] | |
for sentence in text_tmp: | |
control=sentence.replace(" ","") | |
if control == "": | |
pass | |
else: | |
sentences.append(sentence) | |
text=' '.join(sntc for sntc in sentences) | |
text=text.replace(' stream_size null x-parsed-by org.apache.tika.parser.defaultparser x-parsed-by org.apache.tika.parser.txt.txtparser stream_content_type application/txt content-encoding utf-8 content-type text/plain; charset=utf-8 ','') | |
tr_stopwords=stopwords.words('turkish') | |
tr_stopwords.extend([u"an",u"en",u"on",u"ön",u"ın", | |
u"un",u"ün",u"in",u"ta",u"te", | |
u"tu",u"tü",u"tı",u"ti",u"to", | |
u"tö",u"dan",u"den",u"dun", | |
u"dın",u"din"]) | |
textToken=utils.simple_preprocess(text) | |
textToken=list(set(textToken)-set(tr_stopwords)) | |
tagger="" | |
if categories == "dunya": | |
tagger="dunya_%i"%i | |
elif categories == "ekonomi": | |
tagger="ekonomi_%i"%i | |
elif categories == "kultur-sanat": | |
tagger="kultur-sanat_%i"%i | |
elif categories == "politika": | |
tagger="politika_%i" %i | |
elif categories == "spor": | |
tagger="spor_%i"%i | |
i+=1 | |
corpusList.append({'id': tagger, | |
'tokens': textToken, | |
'text':text}) | |
except: | |
pass | |
return corpusList | |
if __name__ == '__main__': | |
train_corpus=solrReadFile()#solr data corpus | |
utils.upload_chunked(server, train_corpus, | |
chunksize=1000) # send 1k docs at a time | |
# | |
server.train(train_corpus, method='lsi') # create a semantic model | |
server.index(train_corpus) | |
#test | |
testDataPath=os.path.join(os.path.dirname(__file__),"test") | |
filenames=os.listdir(testDataPath) | |
for filename in filenames: | |
print filename | |
f=io.open(os.path.join(testDataPath,filename), | |
mode = "r", | |
encoding = "utf8") | |
query=f.read().lower() | |
f.close() | |
doc = {'tokens': utils.simple_preprocess(query)} | |
print(server.find_similar(doc, min_score=0.43, max_results=50)) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment