Last active
December 13, 2016 12:14
-
-
Save ferayebend/1916365 to your computer and use it in GitHub Desktop.
KAT versiyon -0.1 (kaypakkaya analiz toolkit)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/jython | |
# -*- coding: utf-8 -*- | |
import sys | |
import os | |
import java.lang.System.out | |
import java.util.Arrays | |
import java.util.List | |
# Make the Zemberek jars importable.  sys.path entries are used literally --
# '~' is a shell convention that is not expanded here -- so the home
# directory has to be resolved explicitly before the path is appended.
sys.path.append(os.path.expanduser('~/social_data/zemberek2src/jar/zemberek-tr-2.1.1.jar'))
sys.path.append(os.path.expanduser('~/social_data/zemberek2src/jar/zemberek-cekirdek-2.1.1.jar'))
from net.zemberek.erisim import Zemberek | |
from net.zemberek.tr.yapi import TurkiyeTurkcesi | |
def loadRaw(inputFile):
    """Tokenise the lines of *inputFile* on whitespace.

    Lines that begin with ``#`` are skipped.  Every other line -- including
    a blank one, which yields an empty list -- contributes one list of its
    whitespace-separated tokens.

    inputFile -- any iterable of strings, e.g. an open text file.
    Returns a list of token lists, in input order.
    """
    return [
        satir.strip().split()
        for satir in inputFile
        if not satir.startswith("#")
    ]
def temizle(kelime):
    """Return *kelime* with the listed punctuation marks stripped out.

    Removed marks: curly and straight quotes, comma, colon, semicolon,
    full stop, right single quotation mark and question mark.
    """
    atilacaklar = ("”", "“", ",", '"', "'", ":", ";", ".", "’", "?")
    temiz = kelime
    for isaret in atilacaklar:
        temiz = temiz.replace(isaret, "")
    return temiz
def zemb_temizle(kelime):
    """Pull the first token out of the string form of *kelime*.

    ``str(kelime)`` is split on whitespace and the first piece is returned
    with its leading character dropped (for a list that character is the
    opening ``[``).
    """
    ilk_parca = str(kelime).split()[0]
    return ilk_parca[1:]
def cumlelerisay(counter, string):
    """Advance a sentence counter by the end-of-sentence marks in *string*.

    Each of ``!``, ``?`` and ``.`` that occurs anywhere in *string* adds one
    to the count, no matter how many times that mark is repeated.

    counter -- the running total so far.
    Returns the updated total.
    """
    bulunan = sum(1 for isaret in "!?." if isaret in string)
    return counter + bulunan
def getsame(data):
    """Return the distinct items of *data*, in order of first appearance.

    An empty input yields an empty list (the previous implementation
    indexed ``data[0]`` unconditionally and raised IndexError).

    data -- an iterable of items.
    Returns a new list; *data* is not modified.
    """
    result = []
    seen = set()
    for item in data:
        try:
            # Hashable items get a constant-time membership test.
            if item in seen:
                continue
            seen.add(item)
        except TypeError:
            # Unhashable item: fall back to the linear scan.
            if item in result:
                continue
        result.append(item)
    return result
def sorgula(array, referans, sonuc):
    """Count how often each entry of *referans* occurs in *array*.

    One count per reference entry is appended to *sonuc*, in the order of
    *referans*.  Counts are appended rather than written to ``sonuc[i]``,
    so a *sonuc* that already holds values keeps them intact (the previous
    implementation incremented the leading slots of such a list).

    array    -- iterable of items to be counted.
    referans -- iterable of the items whose frequency is wanted.
    sonuc    -- list that receives the counts; it is modified in place.
    Returns *sonuc* itself.
    """
    items = list(array)
    try:
        # Single pass over the data, then one lookup per reference entry.
        frekans = {}
        for element in items:
            frekans[element] = frekans.get(element, 0) + 1
        adetler = [frekans.get(ref, 0) for ref in referans]
    except TypeError:
        # Unhashable items: fall back to pairwise comparison.
        adetler = [sum(1 for element in items if ref == element)
                   for ref in referans]
    sonuc.extend(adetler)
    return sonuc
def main(argv):
    """Stem every word of a text file and report simple text statistics.

    The input file named by ``argv[1]`` is read with loadRaw(); each
    non-comment line counts as one paragraph.  Every word is passed to
    Zemberek's root finder, the roots are written space-separated to
    ``wordl_out.txt``, and a per-root frequency table plus the word,
    paragraph and sentence counts are written to ``<input>_ist.csv``.
    The same counts are printed to standard output.

    argv -- command-line argument list (program name first).
    Returns 0 on success, 1 when the input file argument is missing.
    """
    if len(argv) < 2:
        prog = argv[0] if argv else 'kat'
        sys.stderr.write("kullanim: %s <girdi_dosyasi>\n" % prog)
        return 1
    giri = argv[1]

    zemberek = Zemberek(TurkiyeTurkcesi())
    turg = zemberek.kokBulucu()

    # try/finally rather than `with`, which is not available without a
    # __future__ import on older Jython releases.
    girdi = open(giri)
    try:
        metin = loadRaw(girdi)
    finally:
        girdi.close()
    paragrafsayisi = len(metin)  # numbered and bulleted lines count too

    cumlesayisi = 0
    kokler = []
    out = open('wordl_out.txt', 'w')
    try:
        for paragraf in metin:
            for kelime in paragraf:
                cumlesayisi = cumlelerisay(cumlesayisi, kelime)
                # NOTE(review): temizle() is not applied here, so punctuation
                # attached to a word reaches the root finder -- confirm
                # whether that is intended.
                stmd = zemb_temizle(list(turg.kokBul(kelime)))
                kokler.append(stmd)
                # stmd is a string: write it whole.  Iterating over it would
                # emit its characters one by one, each followed by a space.
                out.write(stmd + ' ')
    finally:
        out.close()
    kelimesayisi = len(kokler)

    # Histogram part: first determine the distinct roots.  getsame() is
    # only called on non-empty input.
    if kokler:
        yunik_kok = getsame(kokler)
    else:
        yunik_kok = []
    # Report the number of distinct roots, not the total number of roots.
    print('yunik kok sayisi  ' + str(len(yunik_kok)))

    # Count the occurrences of every distinct root.
    kok_sayisi = sorgula(kokler, yunik_kok, [])

    # Dump the statistics to the CSV file.
    istatistik = open(giri + '_ist.csv', 'w')
    try:
        istatistik.write("#***************** metin istatistikleri ********************\n")
        istatistik.write("#kelime sayisi:" + str(kelimesayisi)
                         + "\n#paragraf sayisi:" + str(paragrafsayisi)
                         + "\n#cumle sayisi: " + str(cumlesayisi))
        istatistik.write("\n#tekrar eden kelimeler ve tekrar sayilari istatistikleri\n")
        if len(kok_sayisi) == len(yunik_kok):
            for kok, adet in zip(yunik_kok, kok_sayisi):
                istatistik.write(kok + ',' + str(adet) + '\n')
        else:
            print("eror: kok histograminda sorun var")
    finally:
        istatistik.close()

    print("***************** metin istatistikleri ********************")
    print("kelime sayisi:" + str(kelimesayisi))
    print("paragraf sayisi:" + str(paragrafsayisi))
    print("cumle sayisi: " + str(cumlesayisi))
    return 0


if __name__ == '__main__':
    sys.exit(main(sys.argv))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment