Skip to content

Instantly share code, notes, and snippets.

@ferayebend
Last active December 13, 2016 12:14
Show Gist options
  • Save ferayebend/1916365 to your computer and use it in GitHub Desktop.
Save ferayebend/1916365 to your computer and use it in GitHub Desktop.
KAT versiyon -0.1 (kaypakkaya analiz toolkit)
#!/usr/bin/jython
# -*- coding: utf-8 -*-
import sys
import os
import java.lang.System.out
import java.util.Arrays
import java.util.List
# sys.path entries are used verbatim by the import machinery -- a leading "~"
# is NOT expanded to the home directory -- so expand it explicitly before
# registering the Zemberek jars.
sys.path.append(os.path.expanduser('~/social_data/zemberek2src/jar/zemberek-tr-2.1.1.jar'))
sys.path.append(os.path.expanduser('~/social_data/zemberek2src/jar/zemberek-cekirdek-2.1.1.jar'))
from net.zemberek.erisim import Zemberek
from net.zemberek.tr.yapi import TurkiyeTurkcesi
def loadRaw(inputFile):
    """Split every non-comment line of *inputFile* into whitespace tokens.

    *inputFile* is any iterable of text lines (e.g. an open file).  Lines
    whose very first character is ``#`` are skipped.  Every other line --
    including blank ones, which yield an empty list -- contributes one list
    of tokens to the result, in input order.
    """
    return [
        satir.strip().split()
        for satir in inputFile
        if not satir.startswith("#")
    ]
def temizle(kelime):
    """Return *kelime* with every unwanted punctuation character removed.

    The characters dropped are curly and straight quotes, comma, colon,
    semicolon, full stop and question mark; everything else is kept as is.
    """
    atilacaklar = ('”', "“", ",", "\"", "\'", ":", ";", ".", "’", "?")
    temiz = kelime
    for isaret in atilacaklar:
        temiz = temiz.replace(isaret, "")
    return temiz
def zemb_temizle(kelime):
    """Extract the first root from the printed form of a root list.

    *kelime* is converted with ``str()``; the first whitespace-separated
    token of that text is taken and its leading character (the opening
    ``[`` of a printed list) is dropped.

    NOTE(review): this presumably relies on each Zemberek root printing as
    "<root> <type> ..." -- confirm against the Zemberek version in use.

    Returns the root text, or an empty string when the list is empty.
    """
    ilk = str(kelime).split()[0][1:]
    # An empty list prints as "[]" and a list whose only element prints as a
    # single token looks like "[kok]"; in both cases the closing bracket would
    # otherwise leak into the returned root.
    if ilk.endswith("]"):
        ilk = ilk[:-1]
    return ilk
def cumlelerisay(counter, string):
    """Return *counter*, incremented if *string* contains a sentence ending.

    *string* is a single token of the text.  It counts as ending a sentence
    when it contains any of ``!``, ``?`` or ``.``.  A token is counted at
    most once, so "ne?!" adds one sentence rather than two.
    """
    cumlebitimi = ('!', '?', '.')
    for noktalama in cumlebitimi:
        if noktalama in string:
            # One terminator is enough; stop so mixed endings are not
            # counted once per distinct character.
            return counter + 1
    return counter
def getsame(data):
    """Return the distinct items of *data* in order of first appearance.

    Items are compared with ``==`` (list membership), so they need not be
    hashable.  An empty *data* yields an empty list.
    """
    result = []
    for item in data:
        if item not in result:
            result.append(item)
    return result
def sorgula(array, referans, sonuc):
    """Count how often each entry of *referans* occurs in *array*.

    For every reference value, in order, the number of elements of *array*
    equal to it is appended to *sonuc*.  *sonuc* is modified in place and
    also returned; any values it already holds are left untouched.

    array    -- re-iterable sequence of items to be counted
    referans -- sequence of values to look for
    sonuc    -- list receiving one count per reference value
    """
    for aranan in referans:
        sonuc.append(sum(1 for element in array if aranan == element))
    return sonuc
if __name__ == '__main__':
    # Command-line driver: find a root for every word of the input text with
    # Zemberek, dump the roots to wordl_out.txt, and write a root histogram
    # plus basic text statistics to <input>_ist.csv.
    if len(sys.argv) < 2:
        sys.stderr.write("usage: %s INPUT_FILE\n" % sys.argv[0])
        sys.exit(1)
    giri = sys.argv[1]

    zemberek = Zemberek(TurkiyeTurkcesi())
    turg = zemberek.kokBulucu()

    # try/finally (rather than "with") keeps the script usable on older
    # Jython releases while still guaranteeing the files get closed.
    girdi = open(giri)
    try:
        metin = loadRaw(girdi)
    finally:
        girdi.close()

    cumlesayisi = 0
    # Every non-comment input line counts as a paragraph, so numbered items
    # and bullets are included.
    paragrafsayisi = len(metin)
    kokler = []
    out = open('wordl_out.txt', 'w')
    try:
        for paragraf in metin:
            for kelime in paragraf:
                cumlesayisi = cumlelerisay(cumlesayisi, kelime)
                stmd = zemb_temizle(list(turg.kokBul(kelime)))
                kokler.append(stmd)
                # stmd is a single string: write it as one token instead of
                # looping over it, which emitted one character at a time.
                out.write(stmd + ' ')  # TODO: strip punctuation
    finally:
        out.close()
    kelimesayisi = len(kokler)

    # Histogram part: first determine the distinct roots.  The emptiness
    # guard keeps an input without any words from failing here.
    if kokler:
        yunik_kok = getsame(kokler)
    else:
        yunik_kok = []
    print('yunik kok sayisi  ' + str(len(yunik_kok)))

    # Count the occurrences of every distinct root.
    kok_sayisi = sorgula(kokler, yunik_kok, [])

    # Write the statistics to the CSV file.
    istatistik = open(giri + '_ist.csv', 'w')
    try:
        istatistik.write("#***************** metin istatistikleri ********************\n")
        istatistik.write("#kelime sayisi:" + str(kelimesayisi) + "\n#paragraf sayisi:" + str(paragrafsayisi) + "\n#cumle sayisi: " + str(cumlesayisi))
        istatistik.write("\n#tekrar eden kelimeler ve tekrar sayilari istatistikleri\n")
        if len(kok_sayisi) == len(yunik_kok):
            for i in range(len(yunik_kok)):
                istatistik.write(yunik_kok[i] + ',' + str(kok_sayisi[i]) + '\n')
        else:
            print("eror: kok histograminda sorun var")
    finally:
        istatistik.close()

    # Single-argument parenthesised prints behave the same on Python 2 and 3.
    print("***************** metin istatistikleri ********************")
    print("kelime sayisi:" + str(kelimesayisi))
    print("paragraf sayisi:" + str(paragrafsayisi))
    print("cumle sayisi: " + str(cumlesayisi))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment