Created
October 5, 2021 19:12
-
-
Save Xoma163/1f93760c1877769b63043153cdb9a432 to your computer and use it in GitHub Desktop.
Simple converter of VK message dumps to JSON format (about 10 seconds for 200k rows)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import json | |
import os | |
import time | |
from datetime import datetime | |
from joblib import Parallel, cpu_count, delayed | |
from bs4 import BeautifulSoup | |
class VkMessagesDumpConverter:
    """Convert one chat folder of a VK message dump into a single JSON file.

    The folder is expected to contain pages named ``messages<N>.html``.
    Every ``.item`` element found in those pages becomes one dictionary with
    the keys ``author``, ``datetime``, ``text``, ``attachments``, ``edited``
    and ``fwd``.
    """

    # Russian month abbreviations used in message headers -> English ``%b`` names.
    MONTH_TRANSLATOR = {
        'янв': 'Jan',
        'фев': 'Feb',
        'мар': 'Mar',
        'апр': 'Apr',
        'мая': 'May',
        'июн': 'Jun',
        'июл': 'Jul',
        'авг': 'Aug',
        'сен': 'Sep',
        'окт': 'Oct',
        'ноя': 'Nov',
        'дек': 'Dec',
    }
    # Author label the dump uses for the account owner ("You").
    DEFAULT_AUTHOR_NAME = "Вы"

    # Naming scheme of the dump pages: messages<N>.html
    _FILE_PREFIX = "messages"
    _FILE_SUFFIX = ".html"
    # Marker appended to the header date of an edited message.
    _EDITED_MARK = ' (ред.)'
    # Explicit time directives instead of the locale-dependent ``%X``.
    _PARSE_FORMAT = '%d %b %Y %H:%M:%S'
    _OUTPUT_FORMAT = "%d.%m.%Y %H:%M:%S"
    # Known attachment types that are stored without a link.
    _LINKLESS_ATTACHMENTS = frozenset([
        "Запись на стене", "Стикер", "Аудиозапись",
        "Запрос на денежный перевод", "История", "Комментарий на стене",
    ])
    # Known attachment types that carry an ``attachment__link`` anchor.
    _LINKED_ATTACHMENTS = frozenset(["Фотография", "Видеозапись", "Файл", "Ссылка"])

    def __init__(self, path, author_name=None):
        """
        :param path: folder that holds the ``messages<N>.html`` pages of one chat
        :param author_name: real name of the user the dump was made for; it
            replaces the default "Вы" author label in the output
        """
        if not author_name:
            author_name = self.DEFAULT_AUTHOR_NAME
        self.input_path = path
        # Keep only real dump pages: any other file in the folder would make
        # the numeric sort key below raise ValueError.
        dump_files = [
            name for name in os.listdir(path)
            if os.path.isfile(os.path.join(path, name)) and self._is_dump_file(name)
        ]
        # Attribute name kept for backward compatibility; the files are HTML.
        self.all_pdf_files = sorted(dump_files, key=self._get_file_part_number)
        # Written to the current folder as {chat_name}.json. normpath drops a
        # trailing separator and handles the platform's separator style.
        chat_name = os.path.basename(os.path.normpath(path))
        self.output_path = f"{chat_name}.json"
        self.author_name = author_name

    @staticmethod
    def _is_dump_file(filename):
        """Return True when *filename* looks like ``messages<N>.html``."""
        prefix = VkMessagesDumpConverter._FILE_PREFIX
        suffix = VkMessagesDumpConverter._FILE_SUFFIX
        if not (filename.startswith(prefix) and filename.endswith(suffix)):
            return False
        return filename[len(prefix):-len(suffix)].isdecimal()

    @staticmethod
    def _get_file_part_number(filename):
        """
        Extract the page number of a dump file; used as a numeric sort key.

        :param filename: file name such as ``messages50.html``
        :return: the page number as an int
        """
        return int(filename.replace("messages", '').replace('.html', ''))

    def to_json(self):
        """Parse the whole chat folder and write the result to ``self.output_path``."""
        parsed = self._parse()
        json_dumps = json.dumps(parsed, ensure_ascii=False, indent=2)
        with open(self.output_path, 'w', encoding='utf-8') as file:
            file.write(json_dumps)

    def _parse(self):
        """Parse every dump page and return one flat, reversed list of messages.

        Prints the elapsed wall-clock time in seconds.
        """
        start_time = time.time()
        results = parallel(self._parse_file, self.all_pdf_files)
        flat_results = [item for sublist in results for item in sublist]
        # NOTE(review): presumably the dump lists newest messages first, so the
        # reversal yields chronological order - confirm against a real dump.
        flat_results.reverse()
        print(time.time() - start_time)
        return flat_results

    def _parse_file(self, file):
        """Parse one dump page and return its messages in document order.

        :param file: file name relative to ``self.input_path``
        """
        # Read raw bytes so BeautifulSoup can honour the charset declared in
        # the document instead of depending on the locale's default encoding.
        with open(os.path.join(self.input_path, file), 'rb') as page:
            page_body = page.read()
        soup = BeautifulSoup(page_body, 'html.parser')
        return [self._parse_item(item) for item in soup.select('.item')]

    def _parse_item(self, item):
        """Turn one ``.item`` element into a message dictionary."""
        header_div = item.find('div', {
            'class': 'message__header'
        })
        author, datetime_str, flag_edited = self._parse_header(header_div.text)

        body_div = header_div.find_next_sibling('div')
        text = ''
        attachments = []
        flag_fwd = False
        if body_div is not None:
            contents = body_div.contents
            # The text is taken only when something follows the first child;
            # a non-string first child (a tag) means there is no leading text.
            if len(contents) > 1 and isinstance(contents[0], str):
                text = contents[0].strip()
            attachments, flag_fwd = self._parse_attachments(body_div)

        return {
            'author': author,
            'datetime': datetime_str,
            'text': text,
            'attachments': attachments,
            'edited': flag_edited,
            'fwd': flag_fwd
        }

    def _parse_header(self, header_text):
        """Split a header like ``"<author>, 5 окт 2021 в 19:12:00 (ред.)"``.

        :return: tuple ``(author, "dd.mm.YYYY HH:MM:SS", edited_flag)``
        :raises ValueError: when the header does not have the expected shape
        :raises KeyError: when the month abbreviation is unknown
        """
        # Split on the LAST ", " so an author name containing a comma survives.
        author, dt = header_text.rsplit(', ', 1)
        if author == self.DEFAULT_AUTHOR_NAME:
            author = self.author_name

        flag_edited = self._EDITED_MARK in dt
        if flag_edited:
            dt = dt.replace(self._EDITED_MARK, '')

        # Expected shape: "<day> <month> <year> в <time>"; the 4th word is dropped.
        dt_day, dt_month_rus, dt_year, _, dt_time = dt.split(' ')
        dt_month_eng = self.translate_month(dt_month_rus)
        datetime_obj = datetime.strptime(
            f"{dt_day} {dt_month_eng} {dt_year} {dt_time}", self._PARSE_FORMAT
        )
        return author, datetime_obj.strftime(self._OUTPUT_FORMAT), flag_edited

    def _parse_attachments(self, body_div):
        """Collect the attachments found inside a message body.

        :return: tuple ``(attachments, fwd_flag)``; ``fwd_flag`` is True when a
            description containing "прикрепл" was seen (such entries are not
            added to the attachment list)
        """
        attachments = []
        flag_fwd = False
        for attachment in body_div.find_all('div', {'class': 'attachment'}):
            description_div = attachment.find('div', {
                'class': "attachment__description"
            })
            if description_div is None:
                # Nothing to classify; skip instead of raising AttributeError.
                continue
            description = description_div.text
            if "прикрепл" in description:
                flag_fwd = True
                continue

            att_dict = {
                'type': description
            }
            if description in self._LINKED_ATTACHMENTS:
                link_tag = attachment.find('a', {
                    'class': 'attachment__link'
                })
                att_link = link_tag.attrs.get('href') if link_tag is not None else None
                if att_link is not None:
                    att_dict['link'] = att_link
                    # A linked attachment pointing at an .ogg file is relabelled
                    # as a voice message.
                    if att_link.endswith('.ogg'):
                        att_dict['type'] = "Голосовое сообщение"
            elif description not in self._LINKLESS_ATTACHMENTS:
                # Unknown type: report it, but still keep it in the output.
                print(description)
            attachments.append(att_dict)
        return attachments, flag_fwd

    def translate_month(self, rus_month):
        """Map a Russian month abbreviation to its English ``%b`` name.

        :raises KeyError: when *rus_month* is not in ``MONTH_TRANSLATOR``
        """
        return self.MONTH_TRANSLATOR[rus_month]
def parallel(method, data, threads_count=None):
    """
    Run ``method`` once per element of ``data`` through joblib and collect the results.

    :param method: callable invoked with a single item of ``data``
    :param data: iterable of items, one task per item
    :param threads_count: value passed to joblib as ``n_jobs``; when None,
        ``cpu_count()`` is used
    :return: whatever the ``Parallel`` call returns for the submitted tasks
    """
    n_jobs = cpu_count() if threads_count is None else threads_count
    tasks = (delayed(method)(item) for item in data)
    runner = Parallel(n_jobs=n_jobs)
    return runner(tasks)
if __name__ == "__main__":
    # Example run: replace the placeholder folder with the chat folder to
    # convert, and the second argument with the name that should replace the
    # default "Вы" author label in the output.
    vk_mdc = VkMessagesDumpConverter("chats/%chatname%", "first_name second_name")
    vk_mdc.to_json()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment