Skip to content

Instantly share code, notes, and snippets.

@macleginn
Created January 26, 2019 21:37
Show Gist options
  • Select an option

  • Save macleginn/f3c1ee994ceeea5413a85ccc4c24a7f1 to your computer and use it in GitHub Desktop.

Select an option

Save macleginn/f3c1ee994ceeea5413a85ccc4c24a7f1 to your computer and use it in GitHub Desktop.
Preprocess Berezkin
import os
import os.path
import sqlite3
from bs4 import BeautifulSoup
from sys import exit
# Directory that holds both the input folder and the SQLite database file.
WORKING_DIR = 'XXX'
# Sub-directory of WORKING_DIR that is scanned for HTML pages.
INPUT_DIR = 'input_html'

# Every relative path used below is resolved against WORKING_DIR.
os.chdir(WORKING_DIR)

# NOTE(review): this suffix test also accepts names such as 'page.xhtml' or a
# dot-less 'html'; it is kept as-is so the set of processed files is unchanged.
fnames = [fname for fname in os.listdir(INPUT_DIR) if fname.endswith('html')]

conn = sqlite3.connect('myths.sqlite')
cursor = conn.cursor()

# Maps a motif id (the file name up to its first dot) to the page's raw HTML.
# The key is later used as the `id` / `motif_id` column value.
text_dic = {}
for fname in fnames:
    with open(os.path.join(INPUT_DIR, fname), 'r', encoding='utf-8') as inp:
        # partition() returns the whole name when it contains no dot, whereas
        # slicing with find() == -1 would silently drop the last character.
        motif_id = fname.partition('.')[0]
        text_dic[motif_id] = inp.read()
def all_strings(tag):
    """Return every text node found beneath *tag*, joined into one string."""
    return ''.join(tag.strings)


try:
    # Both tables are emptied and refilled on every run.
    cursor.execute('delete from `motif_basic_info`')
    cursor.execute('delete from `text_blobs_by_motif`')

    for k, html in text_dic.items():
        tree = BeautifulSoup(html, 'html.parser')

        # Everything of interest is looked up inside the element with id="main".
        div = tree.find(id='main')
        if div is None:
            # exit() with a string prints it to stderr and exits with status 1.
            exit(f'{k}: no element with id="main" found')

        title_tag = div.find('p', 'NormalLin')
        if title_tag is None:
            exit(f'{k}: no <p> with class "NormalLin" found')
        title = all_strings(title_tag)

        # The description paragraph is optional; store an empty string if absent.
        description_tag = div.find('p', 'NormalLis')
        description = '' if description_tag is None else all_strings(description_tag)

        try:
            cursor.execute(
                '''insert into `motif_basic_info` (
                `id`,
                `motif_name`,
                `motif_description`
                ) values (?,?,?)''',
                (k, title, description),
            )
        except sqlite3.IntegrityError:
            # Dump the offending id and the tags it was built from, then abort.
            print(k)
            print(title_tag)
            print(description_tag)
            exit(1)

        # Each "NormalMai" paragraph is stored as its full HTML markup, not
        # just its text, one row per paragraph.
        cursor.executemany(
            '''insert into `text_blobs_by_motif` (
            `motif_id`,
            `texts_by_region`
            ) values (?,?)''',
            [(k, str(t)) for t in div.find_all('p', 'NormalMai')],
        )

    # Single commit: the rebuild is either stored completely or not at all.
    conn.commit()
finally:
    # Closing without a prior commit discards the pending changes, so an
    # aborted run leaves the previous table contents in place.
    conn.close()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment