Last active
December 11, 2019 03:52
-
-
Save is3ka1/1f36f843b991f5853a66f074feb8ef1b to your computer and use it in GitHub Desktop.
A script that transforms messages exported from Telegram into a CSV file, driven by a YAML config file.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
[[source]] | |
name = "pypi" | |
url = "https://pypi.org/simple" | |
verify_ssl = true | |
[dev-packages] | |
[packages] | |
beautifulsoup4 = "*" | |
pyyaml = "*" | |
pandas = "*" | |
lxml = "*" | |
[requires] | |
python_version = "3.7" |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
just make it be the title |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
messages: "div.history .message:not(.service)" | |
each_msg: | |
date: | |
selector: ".body:not(.forwarded) > .date[title]" | |
value: "v['title']" | |
name: | |
selector: ".body:not(.forwarded) > .from_name" | |
value: "v.get_text().strip()" | |
text: | |
selector: ".body:not(.forwarded) > .text" | |
value: "v.get_text().strip()" |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from bs4 import BeautifulSoup | |
import pandas as pd | |
from glob import glob | |
from os.path import join | |
from yaml import load, Loader | |
import re | |
class TGMsgLoader:
    """Extract messages from exported HTML pages and save them as CSV.

    The pages are the ``messages*.html`` files found in a directory. What
    counts as a message, and which fields are pulled out of each one, is
    driven by a YAML config with two keys:

    * ``messages`` -- CSS selector matching every message element.
    * ``each_msg`` -- mapping of output column name to a dict holding a
      ``selector`` (CSS selector evaluated relative to the message) and a
      ``value`` (a Python expression, evaluated with the selected element
      bound to ``v``).
    """

    # Page file name at the end of a path: ``messages.html``,
    # ``messages2.html``, ...  The digits (possibly empty) are captured.
    _PAGE_PATTERN = re.compile(r'(?:^|[\\/])messages(\d*)\.html$')

    def __init__(self, msg_dir, config_path):
        """Collect the page files of ``msg_dir`` and load the config.

        Args:
            msg_dir: Directory containing the ``messages*.html`` pages.
            config_path: Path of the YAML selector configuration.
        """
        numbered = []
        for path in glob(join(msg_dir, 'messages*.html')):
            index = self._page_index(path)
            # The glob also matches names such as ``messages_old.html``;
            # those are not numbered pages, so skip them instead of crashing.
            if index is not None:
                numbered.append((index, path))
        # Numeric sort so that page 10 comes after page 9, not after page 1.
        numbered.sort(key=lambda pair: pair[0])
        self.file_names = [path for _, path in numbered]

        with open(config_path, encoding='utf-8') as fd:
            # NOTE(security): the full ``Loader`` can construct arbitrary
            # Python objects; only load config files you trust.
            self.sel_config: dict = load(fd.read(), Loader=Loader)

    @staticmethod
    def _page_index(file_name):
        """Return the page number encoded in ``file_name``.

        A name without digits (``messages.html``) counts as page 0.
        Returns ``None`` when the name is not a numbered page file.
        """
        match = TGMsgLoader._PAGE_PATTERN.search(file_name)
        if match is None:
            return None
        return int(match.group(1) or 0)

    def extract(self):
        """Yield one dict per message, in page order.

        Each dict has one entry per key of ``each_msg`` in the config. The
        entry is ``None`` when the field's selector matches nothing inside
        the message.
        """
        for path in self.file_names:
            print('*' * 10, path, '*' * 10)  # debug
            # Read the whole page up front so the file handle is not kept
            # open while the generator is suspended between yields.
            with open(path, encoding='utf-8') as fd:
                soup = BeautifulSoup(fd.read(), 'lxml')
            for msg in soup.select(self.sel_config['messages']):
                result = {}
                for var, config in self.sel_config['each_msg'].items():
                    # The name ``v`` is part of the config contract: the
                    # ``value`` expressions refer to the element as ``v``.
                    v = msg.select_one(config['selector'])
                    if v is not None:
                        # NOTE(security): ``eval`` runs arbitrary code taken
                        # from the config file; never use untrusted configs.
                        v = eval(config['value'])
                    result[var] = v
                yield result

    def save_to_csv(self, file_name='tmp.csv'):
        """Write all extracted messages to ``file_name`` and return them.

        Args:
            file_name: Destination path of the CSV file.

        Returns:
            The ``pandas.DataFrame`` that was written.
        """
        # Passing the columns explicitly keeps the frame well-formed (and
        # the CSV header present) even when no message was extracted.
        columns = list(self.sel_config['each_msg'])
        df = pd.DataFrame(list(self.extract()), columns=columns)
        # Post-processing specific to the default config's column names.
        if 'name' in df.columns:
            # Messages without a sender name inherit the previous one
            # (presumably consecutive messages from one sender omit it --
            # verify against the export format).
            df['name'] = df['name'].ffill()
        if 'date' in df.columns:
            df['date'] = pd.to_datetime(df['date'], format="%d.%m.%Y %H:%M:%S")
        df.to_csv(file_name)
        return df
if __name__ == '__main__':
    # Command-line entry point: parse the export directory, the output CSV
    # name and the selector config, then run the conversion.
    import argparse

    parser = argparse.ArgumentParser(
        description=(
            "A script that transform messages exported from Telegram"
            " to CSV file with some config file."
        ),
    )
    parser.add_argument(
        'dir',
        help="The directory that message exported from Telegram in.",
    )
    parser.add_argument(
        '--file',
        dest="file",
        default="tmp.csv",
        help="The name CSV file will be.",
    )
    parser.add_argument(
        '--config',
        dest="config",
        default="text_sels.yaml",
        help="The configuration that content on HTML should be",
    )
    args = parser.parse_args()

    tg_loader = TGMsgLoader(args.dir, args.config)
    df = tg_loader.save_to_csv(args.file)
    # Equivalent programmatic use:
    #   tg_loader = TGMsgLoader('AIS3-chats/official', 'text_sels.yaml')
    #   df = tg_loader.save_to_csv('official.csv')
Author
is3ka1
commented
Aug 2, 2019
•
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment