This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Split the text into sentences.
def senttoken(teks=None):
    """Split a text into sentences and return them as a list.

    The text is split at every whitespace character that directly follows
    ``.``, ``?`` or ``!``, except where the preceding characters look like
    an abbreviation: a ``letter.letter.`` form such as "i.e.", a capital
    plus lowercase letter plus dot such as "Dr.", or a single capital
    initial such as "A.".

    Args:
        teks: Text to split. When omitted (``None``), one line is read from
            standard input with ``input()``, which is what a call without
            arguments has always done.

    Returns:
        list[str]: The sentences in their original order. The whitespace
        character used as a split point is dropped. An empty text yields
        ``['']``.
    """
    # Imported locally because no module-level ``import re`` is visible
    # next to this function; without it the call fails with NameError.
    import re

    pola = r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<![A-Z]\.)(?<=\.|\?|\!)\s'

    if teks is None:
        teks = input()  # append .lower() here to also apply case folding

    kalimat = re.split(pola, teks)
    print(kalimat)  # echo the result, as the original did
    return kalimat
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# The text must already have gone through word tokenization first.
def tf(sudahDiTokenize):
    """Compute raw term frequencies over tokenized text.

    Args:
        sudahDiTokenize: An iterable of token sequences, e.g. one list of
            words per sentence. Every inner item is counted as one term,
            so a flat list of strings would be counted character by
            character.

    Returns:
        dict: Maps each term to the number of times it occurs across all
        inner sequences. Empty input yields an empty dict.

    Note:
        These are raw counts. For a normalized TF, divide each count by
        the total number of tokens (the length of the flattened input).
    """
    wordfreq = {}
    for tokens in sudahDiTokenize:
        for term in tokens:
            wordfreq[term] = wordfreq.get(term, 0) + 1
    # The original built this table and then discarded it; hand it back.
    return wordfreq