Last active
July 29, 2023 15:06
-
-
Save clepz/70c6758bf58678ff14c756a5a29f4063 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: UTF-8 -*- | |
import sqlite3 | |
from sqlite3 import Error | |
import urllib | |
import HTMLParser | |
import html2text | |
import re | |
from os import listdir | |
from os.path import isfile, join | |
from collections import Counter | |
# Words taken from the other-language column; extended by
# MyHTMLParser.handle_endtag and counted/reset per language by the main loop.
indexList = []
# Words taken from the Turkish column; only extended while lang_code == 'EN'.
indexListTr = []
# Language code of the books currently being imported; assigned by the main
# loop and read by MyHTMLParser.handle_endtag.
lang_code = ''
# True while the parser is inside a table row whose first attribute value
# contains "IgnorePrg"; toggled by MyHTMLParser.handle_starttag.
inIgnorePrg = False
def create_connection(db_file):
    """Open a SQLite connection to ``db_file``.

    Returns the connection object, or ``None`` when ``sqlite3`` raises an
    ``Error`` while connecting (the error is printed, not re-raised).
    """
    try:
        return sqlite3.connect(db_file)
    except Error as exc:
        print(exc)
    return None
def insertP(conn, values):
    """Insert one row into ``paragraphs`` and return its row id.

    ``values`` supplies, in order: paragraph_name, section_id,
    paragraph_text_tr, paragraph_text_other. The statement runs inside
    ``with conn`` so the transaction is committed on success and rolled back
    if the insert raises.
    """
    statement = ''' INSERT INTO paragraphs (paragraph_name,section_id,paragraph_text_tr,paragraph_text_other) VALUES(?,?,?,?) '''
    with conn:
        new_row_id = conn.execute(statement, values).lastrowid
    return new_row_id
def insertBook(conn, values):
    """Insert one row into ``books`` and return its row id.

    ``values`` is the pair (book_name, lang_code). The insert is committed
    when the ``with conn`` block exits without an exception.
    """
    statement = ''' INSERT INTO books (book_name,lang_code) VALUES(?,?) '''
    with conn:
        book_row = conn.execute(statement, values)
        return book_row.lastrowid
def insertSection(conn, values):
    """Insert one row into ``booksSections`` and return its row id.

    ``values`` is the pair (book_id, section_name). The insert is committed
    when the ``with conn`` block exits without an exception.
    """
    statement = ''' INSERT INTO booksSections (book_id,section_name) VALUES(?,?) '''
    with conn:
        section_row = conn.execute(statement, values)
    return section_row.lastrowid
def insertIndex(conn, values, lang_code):
    """Write a word-frequency mapping into ``indexList`` for one language.

    ``values`` maps word -> count (the caller passes a ``Counter``). Words are
    inserted in sorted order and the empty string is skipped. All rows are
    written in a single transaction; nothing is returned.
    """
    statement = ''' INSERT INTO indexList (lang_code,word, count) VALUES(?,?,?)'''
    rows = [(lang_code, word, values[word]) for word in sorted(values) if word != '']
    with conn:
        cursor = conn.cursor()
        for row in rows:
            cursor.execute(statement, row)
def tireIsaretiDuzelt(value):
    """Strip one leading and one trailing hyphen from each word, in place.

    ``value`` is a list of strings; it is modified in place and nothing is
    returned. Empty strings are left untouched, and a word consisting only of
    a hyphen becomes the empty string.

    The previous slices were off by one: a leading hyphen was removed with
    ``[1:-1]`` (which also dropped the last letter of "-word") and a trailing
    hyphen with ``[0:-2]`` (which dropped the last letter of "word-").
    """
    for position, word in enumerate(value):
        if word.startswith('-'):
            word = word[1:]
        # endswith() is False for '', so a lone '-' is handled without indexing.
        if word.endswith('-'):
            word = word[:-1]
        value[position] = word
class MyHTMLParser(HTMLParser.HTMLParser, object):
    """Collect paired Turkish / other-language paragraph text from a page.

    The parser reacts to tags whose FIRST attribute value is ``col-OTHER`` or
    ``col-TR``, accumulates the text of ``p`` elements that follow, and on a
    closing ``td`` stores the pair through ``insertP`` once the Turkish text
    looks finished (see ``handle_endtag``). Words of finished paragraphs are
    also appended to the module-level ``indexList`` / ``indexListTr``.

    NOTE(review): this class was reconstructed from a copy of the file whose
    indentation had been flattened; the nesting below is the most plausible
    reading and should be confirmed against the original file.
    """

    def __init__(self, conn):
        super(MyHTMLParser, self).__init__()
        self.conn = conn          # database connection handed to insertP
        self.section_id = -1      # set by the caller before each feed()
        self.prgSayi = 1
        # "devam*": currently inside the TR / OTHER column.
        # "yazdir*": handle_data should append to the matching buffer.
        self.devam = False
        self.yazdir = False
        self.devamOther = False
        self.yazdirOther = False
        # Text buffers for the Turkish and the other-language column.
        self.cumle = ""
        self.cumleOther = ""
        # Value of the last "name" attribute seen; stored as paragraph_name.
        self.attr = ""
        # Set when a named anchor is seen in the TR column; forces the
        # paragraph to be stored at the end of the cell.
        self.baslik = False
        self.baslikOther = False

    @staticmethod
    def _hasHeadingValue(attrs):
        """Return True if any attribute value contains "başlık" (lower-cased)."""
        for attr in attrs:
            # Valueless attributes are reported with value None.
            if "başlık" in (attr[1] or "").lower():
                return True
        return False

    def handle_starttag(self, tag, attrs):
        """Update row/column state and start capturing text on ``p`` tags.

        Inside a row marked "IgnorePrg", a ``p`` tag is captured only when one
        of its attribute values contains "başlık"; otherwise capturing for the
        current column is switched off.
        """
        global inIgnorePrg
        # Row and column markers are looked up in the first attribute only.
        # A missing or valueless first attribute is treated as an empty value.
        firstValue = (attrs[0][1] or "") if attrs else ""

        if tag == "tr":
            if "IgnorePrg" in firstValue:
                inIgnorePrg = True
            elif "row-Paragraph" in firstValue:
                inIgnorePrg = False

        if firstValue == "col-OTHER":
            self.devamOther = True
        if self.devamOther and tag == "p":
            if inIgnorePrg and not self._hasHeadingValue(attrs):
                self.devamOther = False
                self.yazdirOther = False
                return
            self.yazdirOther = True

        if firstValue == "col-TR":
            self.devam = True
        if self.devam:
            if tag == "p":
                if inIgnorePrg and not self._hasHeadingValue(attrs):
                    self.devam = False
                    self.yazdir = False
                    self.yazdirOther = False
                    return
                # Entering Turkish text ends capturing of the other column.
                self.yazdir = True
                self.yazdirOther = False
                self.devamOther = False
                if self.attr == '':
                    for attr in attrs:
                        if attr[0] == "name":
                            self.attr = attr[1] or ''
            if tag == "a":
                for attr in attrs:  # [ (class,value), (name,value) ]
                    if attr[0] == "name":
                        self.attr = attr[1] or ''
                        self.baslik = True

    def handle_endtag(self, tag):
        """On a closing ``td`` with buffered Turkish text, normalise and store.

        Both buffers are normalised (newlines/tabs to spaces, quote characters
        removed, lower-cased). The paragraph is stored when the stripped
        Turkish text ends with sentence-like punctuation within its last two
        or three characters, or when a named anchor was seen (``self.baslik``).
        Otherwise capturing is switched off and the buffers are kept so that
        later cells can extend them.
        """
        global indexList
        global indexListTr
        global lang_code
        if tag != 'td' or self.cumle == '':
            return

        cumleKontrol = self.cumle.strip()
        self.cumle = self.cumle.replace("\n", " ")
        self.cumle = re.sub(u' +', ' ', self.cumle)
        self.cumle = re.sub(u'\\t+', ' ', self.cumle)
        self.cumle = re.sub(u'[‘’‚„\'`´’¿¡]', '', self.cumle)
        # Map dotless capital I to dotless small i before the generic lower().
        self.cumle = self.cumle.replace(u"I", u"ı")
        self.cumle = self.cumle.lower()
        self.cumle = self.cumle.replace(u'â', u'a').replace(u'û', u'u').replace(u'î', u'i')
        self.cumleOther = self.cumleOther.replace("\n", " ")
        self.cumleOther = re.sub(u' +', ' ', self.cumleOther)
        self.cumleOther = re.sub(u'\\t+', ' ', self.cumleOther)
        self.cumleOther = re.sub(u'[‘’‚„\'`´¿¡]', '', self.cumleOther)
        self.cumleOther = self.cumleOther.lower()

        # Slices (never a bare [-1]) so whitespace-only text cannot raise
        # IndexError; '.' in the last three characters already covers the
        # last character.
        sonIki = cumleKontrol[-2:]
        sonUc = cumleKontrol[-3:]
        tamamlandi = (u'\u2026' in sonIki or '*' in sonIki or ')' in sonIki
                      or '.' in sonUc or '!' in sonUc or ':' in sonUc
                      or '?' in sonUc or self.baslik)
        if not tamamlandi:
            self.devam = False
            self.yazdir = False
            self.devamOther = False
            self.yazdirOther = False
            return

        if lang_code == 'EN':
            # The Turkish word index is only built while importing 'EN'.
            indexListStringTr = nonLetterKarakterleriKaldir(self.cumle).strip()
            indexListStringTr = arapcalariKaldir(indexListStringTr).strip()
            indexListStringTr = indexListStringTr.split(' ')
            tireIsaretiDuzelt(indexListStringTr)
            indexListTr.extend(indexListStringTr)

        indexStringOther = nonLetterKarakterleriKaldir(self.cumleOther).strip()
        if lang_code == "AR" or lang_code == "FA":
            indexStringOther = harekeleriKaldir(indexStringOther).strip()
            self.cumleOther = harekeleriKaldir(self.cumleOther).strip()
        else:
            indexStringOther = arapcalariKaldir(indexStringOther).strip()
        indexStringOther = indexStringOther.split(' ')
        tireIsaretiDuzelt(indexStringOther)
        indexList.extend(indexStringOther)

        # Stored text is stripped and then prefixed with a single space.
        self.cumle = " " + self.cumle.strip()
        self.cumleOther = " " + self.cumleOther.strip()
        self._storeParagraph()

    def handle_data(self, data):
        """Append decoded text to whichever buffers are currently capturing."""
        data = unicode(data, 'utf-8')
        if self.yazdir:
            self.cumle += data
        if self.yazdirOther:
            self.cumleOther += data

    def sectionSonuEkleme(self):
        """Store any Turkish text still buffered at the end of a section."""
        if self.cumle != '':
            self._storeParagraph()

    def _storeParagraph(self):
        """Insert the buffered pair via insertP and reset the capture state."""
        self.yazdir = False
        self.devam = False
        self.yazdirOther = False
        self.devamOther = False
        insertP(self.conn, (self.attr.decode(encoding="utf-8"), self.section_id, self.cumle, self.cumleOther))
        self.cumleOther = ''
        self.cumle = ''
        self.attr = ''
        self.baslik = False
        self.baslikOther = False
def nonLetterKarakterleriKaldir(text):
    """Return ``text`` with punctuation, digits and a few symbols removed.

    Two passes: the first drops ASCII punctuation, guillemets, a few other
    symbols and the digits 0-9; the second drops Arabic punctuation, curly
    quotes, the ellipsis and related marks. Whitespace is left as is.

    The first pattern used to be a byte-string literal. On Python 2, matching
    it against unicode text made each UTF-8 byte of its non-ASCII characters
    act as a separate Latin-1 character, which deleted letters such as
    U+00C2, U+00DB and U+00E2 from the text. Both patterns are now unicode
    literals.
    """
    text = re.sub(u"[,.)(«!:?/۞»;#◌*}{\\[\\]'\"]|[0-9]", u'', text, flags=re.UNICODE)
    text = re.sub(u"[؛،”“…'ِ,﴾﴿ﷺ◌◌◌—]", u"", text, flags=re.UNICODE)
    return text
def arapcalariKaldir(text):
    """Return ``text`` with every character in the listed ranges removed.

    The ranges cover U+0600-U+06FF, U+0750-U+077F and several sub-ranges of
    U+FB50-U+FEFC (the Arabic blocks and presentation forms). All other
    characters, including whitespace, are kept unchanged.
    """
    arabicRanges = re.compile(
        u"[\u0600-\u06ff]|[\u0750-\u077f]|[\ufb50-\ufbc1]|[\ufbd3-\ufd3f]|[\ufd50-\ufd8f]|[\ufd92-\ufdc7]|[\ufe70-\ufefc]|[\uFDF0-\uFDFD]",
        re.UNICODE)
    return arabicRanges.sub('', text)
def harekeleriKaldir(text):
    """Return ``text`` with Arabic vowel marks and the tatweel removed.

    Removed code points: shadda (U+0651), fatha (U+064E), tanwin fath
    (U+064B), damma (U+064F), tanwin damm (U+064C), kasra (U+0650), tanwin
    kasr (U+064D), sukun (U+0652) and tatweel/kashida (U+0640).

    The marks are written as explicit escapes and matched directly on the
    text. The earlier version encoded the text to UTF-8, matched a byte
    pattern containing invisible combining characters, and decoded the result
    again on every call.
    """
    harekeler = (
        u"\u0651"   # Tashdid
        u"\u064e"   # Fatha
        u"\u064b"   # Tanwin Fath
        u"\u064f"   # Damma
        u"\u064c"   # Tanwin Damm
        u"\u0650"   # Kasra
        u"\u064d"   # Tanwin Kasr
        u"\u0652"   # Sukun
        u"\u0640"   # Tatwil/Kashida
    )
    return re.sub(u"[" + harekeler + u"]", u"", text, flags=re.UNICODE)
if __name__ == '__main__':
    # Import every book folder under `mypath` into the SQLite database:
    # one `books` row per folder, one `booksSections` row per section file,
    # paragraphs via MyHTMLParser, then one word index per language.
    #
    # This stays at module level (not inside a main() function) because
    # MyHTMLParser.handle_endtag reads `lang_code`, `indexList` and
    # `indexListTr` as module globals.
    conn = create_connection("/home/clepz/Documents/RisaleKarsilastirmali/Risaleler.db")
    if conn is None:
        # create_connection already printed the sqlite3 error.
        raise SystemExit("could not open the database")
    try:
        parser = MyHTMLParser(conn)
        mypath = "/home/clepz/Documents/RisaleKarsilastirmali/AppKarsilastirmaliKitaplar9Subat2020/books"
        # Folder-name prefix -> language code stored in the database.
        langs = {'ing': 'EN', 'arabi': 'AR', 'rusca': 'RU', 'almanca': 'DE', 'ozbek': 'UZ',
                 'jap': 'JA', 'isp': 'ES', 'farsi': 'FA', 'endonezce': "ID", 'cince': "CN",
                 'fransizca': 'FR'}
        folders = listdir(mypath)
        for name, lang_code in langs.items():
            folderNames = [i for i in folders if i.startswith(name)]
            for f in folderNames:
                print(f)
                # Add the book.
                book_id = insertBook(conn, (f, lang_code))
                sectionsPath = mypath + "/" + f
                print(sectionsPath + " ----- " + str(book_id))
                # Collect the section files of this book.
                # NOTE(review): plain string sort puts "section-10" before
                # "section-2"; confirm whether numeric order is wanted.
                sections = listdir(sectionsPath)
                sections.sort()
                for section in sections:
                    # Read each section file and add its content to the database.
                    sectionName = re.findall("section-[0-9]+", section)
                    if sectionName:
                        parser.section_id = insertSection(conn, (book_id, sectionName[0]))
                        print(sectionsPath + "/" + section)
                        # Read the local file directly (and close it) instead
                        # of going through urllib.
                        with open(sectionsPath + "/" + section, 'rb') as sectionFile:
                            page = sectionFile.read()
                        parser.feed(page)
                        parser.sectionSonuEkleme()
            # All books of this language are done: write its word index.
            if lang_code != 'JA':
                values = Counter(indexList)
                insertIndex(conn, values, lang_code)
                if lang_code == 'EN':
                    values = Counter(indexListTr)
                    insertIndex(conn, values, "TR")
                    print('tr dili indexi eklendi.')
                    indexListTr = []
                print(lang_code + ' dili indexi eklendi')
            indexList = []
    finally:
        conn.close()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment