py wiki_to_text.py '/hdd/Downloads/simplewiki-20240120-pages-meta-current.xml.bz2' `pwd`/output/
Created
March 31, 2024 23:40
-
-
Save cheeseonamonkey/1cbee6b0a8d63c79e3ea4e9165233dfa to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import argparse | |
import tkinter as tk | |
from tkinter import filedialog, scrolledtext | |
import webbrowser | |
import os | |
import bz2 | |
import json | |
import re | |
from html2text import html2text as htt | |
import wikitextparser as wtp | |
from threading import Thread | |
import sys | |
def dewiki(text):
    """Convert raw wiki markup into a single line of plain text.

    The markup is rendered to plain text with wikitextparser, the result is
    run through html2text to get rid of leftover HTML, and every run of
    whitespace is collapsed into one space.

    Args:
        text: Wiki markup of one article.

    Returns:
        The flattened plain-text version of ``text``.
    """
    text = wtp.parse(text).plain_text()  # wiki markup -> plain text
    text = htt(text)  # remove any HTML
    # '\\n' is the two-character sequence backslash + "n" (an escaped newline
    # that appears literally in the text). Real newline characters are
    # handled by the whitespace collapse below.
    text = text.replace('\\n', ' ')
    # Raw string: '\s' inside a normal string literal is an invalid escape
    # sequence and triggers a warning on current Python versions.
    text = re.sub(r'\s+', ' ', text)
    return text
def analyze_chunk(text):
    """Extract the title, id and plain text from one page chunk of a dump.

    Args:
        text: Raw XML text of a single page.

    Returns:
        A dict with the keys 'title', 'text' and 'id', or None when the chunk
        is a redirect, contains '(disambiguation)', has a ':' in its title,
        or cannot be parsed (in which case the error is printed).
    """
    try:
        # Redirect stubs and disambiguation pages are not real articles.
        if '<redirect title="' in text or '(disambiguation)' in text:
            return None

        raw_title = text.split('<title>')[1].split('</title>')[0]
        title = htt(raw_title)
        # Most titles containing ':' are not articles we care about.
        if ':' in title:
            return None

        page_id = text.split('<id>')[1].split('</id>')[0]

        # Take what lies between the opening <text ...> tag (attributes
        # included) and the closing </text tag.
        before_close = text.split('</text')[0]
        after_open = before_close.split('<text')[1]
        markup = after_open.split('>', maxsplit=1)[1]
        plain = dewiki(markup)

        return {
            'title': title.strip(),
            'text': plain.strip(),
            'id': page_id.strip(),
        }
    except Exception as oops:
        print(oops)
        return None
def save_article(article, savedir, log_text):
    """Parse one raw page chunk and write its text to ``<title>.txt``.

    Chunks for which analyze_chunk() returns nothing are ignored, and a file
    that already exists is never overwritten.

    Args:
        article: Raw XML text of one page.
        savedir: Directory the text file is written into.
        log_text: Tk text widget that receives one log line per handled page.
    """
    doc = analyze_chunk(article)
    if not doc:
        return

    # Replace slashes with underscores so the title is a single path component.
    filename = doc['title'].replace('/', '_') + '.txt'
    filepath = os.path.join(savedir, filename)
    try:
        # Mode 'x' creates the file and fails if it already exists, in one
        # step. A separate os.path.exists() check followed by open(..., 'w')
        # can race when this function runs in several threads at once.
        with open(filepath, 'x', encoding='utf-8') as outfile:
            outfile.write(doc['text'])
    except FileExistsError:
        log_text.insert(tk.END, f"File {filename} already exists, skipping...\n")
        return

    log_text.insert(tk.END, filename.ljust(55) + "\n")
    print(filename)  # Print filename to console
def process_file_text(filename, savedir, log_text):
    """Split an XML dump into pages and hand each page to save_article().

    The file is read line by line. Lines following a '<page>' line are
    collected, and when a '</page>' line is reached a new thread running
    save_article() is started with the collected text. This function does not
    wait for those threads. Errors are reported in the log widget and on the
    console instead of being raised.

    Args:
        filename: Path of the (uncompressed) XML dump.
        savedir: Directory passed on to save_article().
        log_text: Tk text widget used for logging.
    """
    try:
        page_lines = []
        with open(filename, 'r', encoding='utf-8') as infile:
            for line in infile:
                if '<page>' in line:
                    # A new article starts: drop whatever was collected.
                    page_lines = []
                    continue
                if '</page>' in line:  # end of article
                    worker = Thread(
                        target=save_article,
                        args=(''.join(page_lines), savedir, log_text),
                    )
                    worker.start()
                    continue
                page_lines.append(line)
    except Exception as e:
        message = f"Error processing article: {str(e)}"
        log_text.insert(tk.END, message + "\n")
        print(message)
def browse_file(entry):
    """Ask the user for a file and put the chosen path into ``entry``.

    Args:
        entry: Tk entry widget whose contents are replaced by the selection.
    """
    filename = filedialog.askopenfilename()
    if not filename:
        # Nothing was selected (dialog dismissed): keep the current contents
        # instead of replacing them with an empty value.
        return
    entry.delete(0, tk.END)
    entry.insert(0, filename)
def browse_directory(entry):
    """Ask the user for a directory and put the chosen path into ``entry``.

    Args:
        entry: Tk entry widget whose contents are replaced by the selection.
    """
    directory = filedialog.askdirectory()
    if not directory:
        # Nothing was selected (dialog dismissed): keep the current contents
        # instead of replacing them with an empty value.
        return
    entry.delete(0, tk.END)
    entry.insert(0, directory)
def decompress_file(xml_file, chunk_size=1024 * 1024):
    """Decompress a ``.bz2`` file next to itself and return the new path.

    Args:
        xml_file: Path of the input file.
        chunk_size: Number of decompressed bytes copied per read.

    Returns:
        The path without its ``.bz2`` suffix when the input was decompressed,
        otherwise ``xml_file`` unchanged (paths not ending in ``.bz2`` are
        not touched).
    """
    if not xml_file.endswith('.bz2'):
        return xml_file

    decompressed_file = xml_file[:-4]  # Remove .bz2 extension
    # The source is opened first so that a missing input does not leave an
    # empty output file behind.
    with bz2.BZ2File(xml_file, 'rb') as f_in, open(decompressed_file, 'wb') as f_out:
        # Copy in bounded chunks: reading the whole archive at once would
        # hold the complete decompressed dump in memory.
        for chunk in iter(lambda: f_in.read(chunk_size), b''):
            f_out.write(chunk)
    return decompressed_file
def convert_to_json(xml_file, json_dir, decompress, log_text):
    """Run the whole conversion for one dump and report progress in the log.

    The dump is optionally decompressed with decompress_file() and then
    passed to process_file_text(). A decompressed copy created by this call
    is deleted again afterwards.

    Args:
        xml_file: Path of the dump (``.xml`` or ``.xml.bz2``).
        json_dir: Output directory handed to process_file_text().
        decompress: When true, a ``.bz2`` input is decompressed first.
        log_text: Tk text widget used for progress messages.
    """
    log_text.delete(1.0, tk.END)  # Clear previous logs
    log_text.insert(tk.END, f"Starting conversion:\n -input: {xml_file}\n")
    log_text.insert(tk.END, f" -output: {json_dir}\n")
    log_text.update()

    log_text.insert(tk.END, "Decompressing...\n")
    log_text.update()
    # Keep the original path: it is needed below to tell whether a temporary
    # decompressed copy was created.
    source_file = xml_file
    xml_file = decompress_file(source_file) if decompress else source_file
    log_text.insert(tk.END, f"Decompressed: {xml_file}\n")
    log_text.update()

    log_text.insert(tk.END, "Conversion in progress...\n")
    log_text.update()
    process_file_text(xml_file, json_dir, log_text)
    log_text.insert(tk.END, "Conversion completed!\n")
    log_text.update()

    # The previous check tested the reassigned path for a '.bz2' suffix, which
    # it never has after decompression, so the cleanup could not run. The
    # paths differ only when a decompressed copy was written by this call.
    if xml_file != source_file:
        os.remove(xml_file)
def open_link(url):
    """Open ``url`` in a new window of the default web browser."""
    webbrowser.open(url, new=1)
def main():
    """Parse the command line, build the Tk window and run the event loop.

    The two positional arguments (dump file and output directory) pre-fill
    the entry fields; the conversion starts when "Convert" is pressed.
    """
    parser = argparse.ArgumentParser(description='Process Wikipedia XML dump file and save as text files.')
    parser.add_argument('xml_file', help='Wikipedia XML dump file (.xml or .xml.bz2)')
    parser.add_argument('json_dir', help='Directory to save the converted JSON files')
    args = parser.parse_args()

    window = tk.Tk()
    window.title("Wikipedia to text")

    # Widgets
    tk.Label(window, text="Wikipedia XML dump file:").grid(row=0, column=0, padx=5, pady=5)
    xml_entry = tk.Entry(window, width=45)
    xml_entry.grid(row=0, column=1, padx=5, pady=5)
    xml_entry.insert(0, args.xml_file)

    tk.Label(window, text="JSON save directory:").grid(row=1, column=0, padx=5, pady=5)
    json_entry = tk.Entry(window, width=45)
    json_entry.grid(row=1, column=1, padx=5, pady=5)
    json_entry.insert(0, args.json_dir)

    decompress_var = tk.BooleanVar()
    decompress_check = tk.Checkbutton(window, text="Decompress if .bz2", variable=decompress_var)
    decompress_check.grid(row=2, columnspan=3, padx=5, pady=5)

    log_text = scrolledtext.ScrolledText(window, width=70, height=13)
    log_text.grid(row=3, columnspan=3, padx=5, pady=5)

    convert_button = tk.Button(
        window,
        text="Convert",
        command=lambda: convert_to_json(xml_entry.get(), json_entry.get(), decompress_var.get(), log_text),
    )
    convert_button.grid(row=4, columnspan=3, padx=5, pady=5)

    # Add links
    link_text = tk.Text(window, height=1, width=65)
    link_text.grid(row=5, columnspan=3, padx=5, pady=5)
    link_text.insert(tk.END, "Compressed Wikipedia dumps: ")
    # Each link needs its own tag: binding two handlers to one shared tag
    # makes the second replace the first, so both links open the same URL.
    links = (
        ("link_enwiki", "English (21GB)", "https://dumps.wikimedia.org/enwiki/20240120/"),
        ("link_simplewiki", "Simple English (1GB)", "https://dumps.wikimedia.org/simplewiki/20240120/"),
    )
    for index, (tag, label, url) in enumerate(links):
        if index:
            link_text.insert(tk.END, " | ")
        link_text.tag_configure(tag, foreground="blue", underline=True)
        # Bind the URL as a default argument so every handler keeps its own
        # value instead of the last one of the loop.
        link_text.tag_bind(tag, "<Button-1>", lambda e, target=url: open_link(target))
        link_text.insert(tk.END, label, tag)
    link_text.config(state=tk.DISABLED)

    window.mainloop()
# Start the GUI only when the file is executed as a script, not on import.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment