cheeseonamonkey · March 31, 2024 23:40
diff --git a/usage.md b/usage.md
diff --git a/wiki_to_text.py b/wiki_to_text.py
 import argparse
 import tkinter as tk
 from tkinter import filedialog, scrolledtext
 import webbrowser
 import os
 import bz2
 import json
 import re
 from html2text import html2text as htt
 import wikitextparser as wtp
 from threading import Thread
 import sys

 def dewiki(text):
    """Convert a chunk of wiki markup into single-line plain text.

    The markup is rendered to plain text with ``wikitextparser``, the result
    is passed through ``html2text``, and finally whitespace is normalised.

    Args:
        text: Wiki markup to convert.

    Returns:
        The converted text with every run of whitespace collapsed to a single
        space.
    """
    text = wtp.parse(text).plain_text()  # wiki to plaintext
    text = htt(text)  # remove any HTML
    # Replace literal backslash-n sequences (the two characters "\" and "n");
    # real newline characters are collapsed by the regex below.
    text = text.replace('\\n', ' ')
    # Raw string: '\s' inside a normal literal is an invalid escape sequence
    # and produces a warning on recent Python versions.
    text = re.sub(r'\s+', ' ', text)
    return text

 def analyze_chunk(text):
    """Turn the raw XML of one page into an article record.

    Pages containing a redirect marker or the string ``(disambiguation)``,
    and pages whose title contains a colon, are skipped.

    Args:
        text: XML text of a single page.

    Returns:
        A dict with the stripped ``title``, ``text`` and ``id`` of the page,
        or ``None`` when the page is skipped or an error occurs while
        extracting its fields.
    """
    try:
        # Redirect stubs are not the main article.
        if '<redirect title="' in text:
            return None
        # Disambiguation pages are not articles.
        if '(disambiguation)' in text:
            return None

        after_title_open = text.split('<title>')[1]
        title = htt(after_title_open.split('</title>')[0])
        # Most titles containing ':' are not articles we care about.
        if ':' in title:
            return None

        after_id_open = text.split('<id>')[1]
        serial = after_id_open.split('</id>')[0]

        before_text_close = text.split('</text')[0]
        text_element = before_text_close.split('<text')[1]
        # Drop the remainder of the opening <text ...> tag.
        raw_body = text_element.split('>', maxsplit=1)[1]
        content = dewiki(raw_body)

        return {
            'title': title.strip(),
            'text': content.strip(),
            'id': serial.strip(),
        }
    except Exception as oops:
        # Any failure is printed and the page is dropped.
        print(oops)
        return None
    

 def save_article(article, savedir, log_text):
    """Write one article to ``<savedir>/<title>.txt`` and log the outcome.

    Pages for which ``analyze_chunk`` returns nothing are ignored. A file
    that already exists is never overwritten.

    Args:
        article: Raw XML text of one page, as accepted by ``analyze_chunk``.
        savedir: Directory in which the text file is created.
        log_text: Text widget that receives one line per handled article.
    """
    doc = analyze_chunk(article)
    if not doc:
        return

    # Replace slashes with underscores so the title is a single path component.
    filename = doc['title'].replace('/', '_') + '.txt'
    filepath = os.path.join(savedir, filename)
    try:
        # Mode 'x' creates the file and fails if it already exists, in one
        # step. This function is started on worker threads, so a separate
        # os.path.exists() check could race with another thread that is
        # writing the same title.
        with open(filepath, 'x', encoding='utf-8') as outfile:
            outfile.write(doc['text'])
    except FileExistsError:
        log_text.insert(tk.END, f"File {filename} already exists, skipping...\n")
        return

    log_text.insert(tk.END, filename.ljust(55) + "\n")
    print(filename)  # Print filename to console



 def process_file_text(filename, savedir, log_text):
    """Split an XML dump into pages and save each page on its own thread.

    The file is read line by line. Lines between a ``<page>`` line and the
    following ``</page>`` line are collected and handed to ``save_article``
    in a newly started thread; the threads are not waited for.

    Args:
        filename: Path of the UTF-8 encoded XML dump.
        savedir: Output directory, forwarded to ``save_article``.
        log_text: Text widget used for logging, forwarded to ``save_article``.
    """
    try:
        page_lines = []
        with open(filename, 'r', encoding='utf-8') as infile:
            for line in infile:
                if '<page>' in line:
                    # A new page starts: forget whatever was collected so far.
                    page_lines = []
                    continue
                if '</page>' not in line:
                    page_lines.append(line)
                    continue
                # End of article: process the collected text in a new thread.
                worker = Thread(
                    target=save_article,
                    args=(''.join(page_lines), savedir, log_text),
                )
                worker.start()
    except Exception as e:
        message = f"Error processing article: {str(e)}"
        log_text.insert(tk.END, message + "\n")
        print(message)


 def browse_file(entry):
    """Let the user pick a file and put its path into ``entry``.

    If no file is chosen, ``entry`` is left unchanged instead of being
    blanked.

    Args:
        entry: Entry widget whose content is replaced by the chosen path.
    """
    filename = filedialog.askopenfilename()
    if not filename:
        # Empty result: nothing was selected (e.g. the dialog was cancelled).
        return
    entry.delete(0, tk.END)
    entry.insert(0, filename)

 def browse_directory(entry):
    """Let the user pick a directory and put its path into ``entry``.

    If no directory is chosen, ``entry`` is left unchanged instead of being
    blanked.

    Args:
        entry: Entry widget whose content is replaced by the chosen path.
    """
    directory = filedialog.askdirectory()
    if not directory:
        # Empty result: nothing was selected (e.g. the dialog was cancelled).
        return
    entry.delete(0, tk.END)
    entry.insert(0, directory)

 def decompress_file(xml_file):
    """Decompress a ``.bz2`` file next to the original and return its path.

    Args:
        xml_file: Path of the input file.

    Returns:
        The path of the decompressed file (``xml_file`` without its ``.bz2``
        suffix), or ``xml_file`` unchanged when it does not end in ``.bz2``.
    """
    if not xml_file.endswith('.bz2'):
        return xml_file

    decompressed_file = xml_file[:-4]  # Remove .bz2 extension
    chunk_size = 1024 * 1024
    with open(decompressed_file, 'wb') as f_out, bz2.BZ2File(xml_file, 'rb') as f_in:
        # Copy in fixed-size chunks so the whole decompressed file never has
        # to be held in memory at once.
        for chunk in iter(lambda: f_in.read(chunk_size), b''):
            f_out.write(chunk)
    return decompressed_file

 def convert_to_json(xml_file, json_dir, decompress, log_text):
    """Run one conversion and report progress in ``log_text``.

    Args:
        xml_file: Path of the dump to convert.
        json_dir: Directory that receives the output files.
        decompress: When true, the input is passed through
            ``decompress_file`` before it is processed.
        log_text: Text widget used as the progress log; it is cleared first.
    """
    log_text.delete(1.0, tk.END)  # Clear previous logs
    log_text.insert(tk.END, f"Starting conversion:\n -input: {xml_file}\n")
    log_text.insert(tk.END, f" -output:  {json_dir}\n")
    log_text.update()

    source_file = xml_file
    if decompress:
        # Only report decompression when it was actually requested.
        log_text.insert(tk.END, "Decompressing...\n")
        log_text.update()
        xml_file = decompress_file(xml_file)
        log_text.insert(tk.END, f"Decompressed: {xml_file}\n")
        log_text.update()

    log_text.insert(tk.END, "Conversion in progress...\n")
    log_text.update()

    process_file_text(xml_file, json_dir, log_text)

    log_text.insert(tk.END, "Conversion completed!\n")
    log_text.update()

    # Remove the intermediate decompressed copy. The path is compared with
    # the original input because xml_file no longer carries the .bz2 suffix
    # after decompression, so a suffix test here would never match.
    if xml_file != source_file:
        os.remove(xml_file)

 def open_link(url):
    """Open ``url`` with the ``webbrowser`` module, requesting a new window."""
    # new=1 is what webbrowser.open_new() passes to webbrowser.open().
    webbrowser.open(url, new=1)

 def main():
    """Build the converter window and run the Tk event loop.

    The two required command-line arguments (dump file and output directory)
    are used to pre-fill the corresponding entry fields.
    """
    parser = argparse.ArgumentParser(description='Process Wikipedia XML dump file and save as text files.')
    parser.add_argument('xml_file', help='Wikipedia XML dump file (.xml or .xml.bz2)')
    parser.add_argument('json_dir', help='Directory to save the converted JSON files')
    args = parser.parse_args()

    window = tk.Tk()
    window.title("Wikipedia to text")

    # Widgets
    tk.Label(window, text="Wikipedia XML dump file:").grid(row=0, column=0, padx=5, pady=5)
    xml_entry = tk.Entry(window, width=45)
    xml_entry.grid(row=0, column=1, padx=5, pady=5)
    xml_entry.insert(0, args.xml_file)

    tk.Label(window, text="JSON save directory:").grid(row=1, column=0, padx=5, pady=5)
    json_entry = tk.Entry(window, width=45)
    json_entry.grid(row=1, column=1, padx=5, pady=5)
    json_entry.insert(0, args.json_dir)

    decompress_var = tk.BooleanVar()
    decompress_check = tk.Checkbutton(window, text="Decompress if .bz2", variable=decompress_var)
    decompress_check.grid(row=2, columnspan=3, padx=5, pady=5)

    log_text = scrolledtext.ScrolledText(window, width=70, height=13)
    log_text.grid(row=3, columnspan=3, padx=5, pady=5)

    # The entries are read when the button is pressed, so edits made in the
    # window take effect.
    convert_button = tk.Button(window, text="Convert", command=lambda: convert_to_json(xml_entry.get(), json_entry.get(), decompress_var.get(), log_text))
    convert_button.grid(row=4, columnspan=3, padx=5, pady=5)

    # Add links
    link_text = tk.Text(window, height=1, width=65)
    link_text.grid(row=5, columnspan=3, padx=5, pady=5)
    link_text.insert(tk.END, "Compressed Wikipedia dumps: ")
    dump_links = (
        ("English (21GB)", "https://dumps.wikimedia.org/enwiki/20240120/"),
        ("Simple English (1GB)", "https://dumps.wikimedia.org/simplewiki/20240120/"),
    )
    for index, (label, url) in enumerate(dump_links):
        if index:
            link_text.insert(tk.END, " | ")
        # One tag per link: binding <Button-1> twice on a shared tag replaces
        # the first handler, which made every link open the last URL.
        tag = f"link{index}"
        link_text.tag_configure(tag, foreground="blue", underline=True)
        # url is bound as a default argument so each handler keeps its own URL.
        link_text.tag_bind(tag, "<Button-1>", lambda e, url=url: open_link(url))
        link_text.insert(tk.END, label, tag)
    link_text.config(state=tk.DISABLED)

    window.mainloop()

 # Start the GUI only when this file is executed as a script, not on import.
 if __name__ == "__main__":
    main()
	import argparse
	import tkinter as tk
	from tkinter import filedialog, scrolledtext
	import webbrowser
	import os
	import bz2
	import json
	import re
	from html2text import html2text as htt
	import wikitextparser as wtp
	from threading import Thread
	import sys

	def dewiki(text):
	    """Convert a chunk of wiki markup into single-line plain text.

	    The markup is rendered to plain text with ``wikitextparser``, the result
	    is passed through ``html2text``, and finally whitespace is normalised.

	    Args:
	        text: Wiki markup to convert.

	    Returns:
	        The converted text with every run of whitespace collapsed to a single
	        space.
	    """
	    text = wtp.parse(text).plain_text()  # wiki to plaintext
	    text = htt(text)  # remove any HTML
	    # Replace literal backslash-n sequences (the two characters "\" and "n");
	    # real newline characters are collapsed by the regex below.
	    text = text.replace('\\n', ' ')
	    # Raw string: '\s' inside a normal literal is an invalid escape sequence
	    # and produces a warning on recent Python versions.
	    text = re.sub(r'\s+', ' ', text)
	    return text

	def analyze_chunk(text):
	    """Turn the raw XML of one page into an article record.

	    Pages containing a redirect marker or the string ``(disambiguation)``,
	    and pages whose title contains a colon, are skipped.

	    Args:
	        text: XML text of a single page.

	    Returns:
	        A dict with the stripped ``title``, ``text`` and ``id`` of the page,
	        or ``None`` when the page is skipped or an error occurs while
	        extracting its fields.
	    """
	    try:
	        # Redirect stubs are not the main article.
	        if '<redirect title="' in text:
	            return None
	        # Disambiguation pages are not articles.
	        if '(disambiguation)' in text:
	            return None

	        title = text.split('<title>')[1].split('</title>')[0]
	        title = htt(title)
	        # Most titles containing ':' are not articles we care about.
	        if ':' in title:
	            return None

	        serial = text.split('<id>')[1].split('</id>')[0]
	        # Take what lies between '<text' and '</text', then drop the
	        # remainder of the opening tag.
	        content = text.split('</text')[0].split('<text')[1].split('>', maxsplit=1)[1]
	        content = dewiki(content)
	        return {'title': title.strip(), 'text': content.strip(), 'id': serial.strip()}
	    except Exception as oops:
	        # Any failure is printed and the page is dropped.
	        print(oops)
	        return None


	def save_article(article, savedir, log_text):
	    """Write one article to ``<savedir>/<title>.txt`` and log the outcome.

	    Pages for which ``analyze_chunk`` returns nothing are ignored. A file
	    that already exists is never overwritten.

	    Args:
	        article: Raw XML text of one page, as accepted by ``analyze_chunk``.
	        savedir: Directory in which the text file is created.
	        log_text: Text widget that receives one line per handled article.
	    """
	    doc = analyze_chunk(article)
	    if not doc:
	        return

	    # Replace slashes with underscores so the title is a single path component.
	    filename = doc['title'].replace('/', '_') + '.txt'
	    filepath = os.path.join(savedir, filename)
	    try:
	        # Mode 'x' creates the file and fails if it already exists, in one
	        # step. This function is started on worker threads, so a separate
	        # os.path.exists() check could race with another thread that is
	        # writing the same title.
	        with open(filepath, 'x', encoding='utf-8') as outfile:
	            outfile.write(doc['text'])
	    except FileExistsError:
	        log_text.insert(tk.END, f"File {filename} already exists, skipping...\n")
	        return

	    log_text.insert(tk.END, filename.ljust(55) + "\n")
	    print(filename)  # Print filename to console



	def process_file_text(filename, savedir, log_text):
	    """Split an XML dump into pages and save each page on its own thread.

	    The file is read line by line. Lines between a ``<page>`` line and the
	    following ``</page>`` line are collected and handed to ``save_article``
	    in a newly started thread; the threads are not waited for.

	    Args:
	        filename: Path of the UTF-8 encoded XML dump.
	        savedir: Output directory, forwarded to ``save_article``.
	        log_text: Text widget used for logging, forwarded to ``save_article``.
	    """
	    try:
	        article = ''
	        with open(filename, 'r', encoding='utf-8') as infile:
	            for line in infile:
	                if '<page>' in line:
	                    # A new page starts: forget whatever was collected so far.
	                    article = ''
	                elif '</page>' in line:  # end of article
	                    Thread(target=save_article, args=(article, savedir, log_text)).start()
	                else:
	                    article += line
	    except Exception as e:
	        log_text.insert(tk.END, f"Error processing article: {str(e)}\n")
	        print(f"Error processing article: {str(e)}")


	def browse_file(entry):
	    """Let the user pick a file and put its path into ``entry``.

	    If no file is chosen, ``entry`` is left unchanged instead of being
	    blanked.

	    Args:
	        entry: Entry widget whose content is replaced by the chosen path.
	    """
	    filename = filedialog.askopenfilename()
	    if not filename:
	        # Empty result: nothing was selected (e.g. the dialog was cancelled).
	        return
	    entry.delete(0, tk.END)
	    entry.insert(0, filename)

	def browse_directory(entry):
	    """Let the user pick a directory and put its path into ``entry``.

	    If no directory is chosen, ``entry`` is left unchanged instead of being
	    blanked.

	    Args:
	        entry: Entry widget whose content is replaced by the chosen path.
	    """
	    directory = filedialog.askdirectory()
	    if not directory:
	        # Empty result: nothing was selected (e.g. the dialog was cancelled).
	        return
	    entry.delete(0, tk.END)
	    entry.insert(0, directory)

	def decompress_file(xml_file):
	    """Decompress a ``.bz2`` file next to the original and return its path.

	    Args:
	        xml_file: Path of the input file.

	    Returns:
	        The path of the decompressed file (``xml_file`` without its ``.bz2``
	        suffix), or ``xml_file`` unchanged when it does not end in ``.bz2``.
	    """
	    if not xml_file.endswith('.bz2'):
	        return xml_file

	    decompressed_file = xml_file[:-4]  # Remove .bz2 extension
	    chunk_size = 1024 * 1024
	    with open(decompressed_file, 'wb') as f_out, bz2.BZ2File(xml_file, 'rb') as f_in:
	        # Copy in fixed-size chunks so the whole decompressed file never has
	        # to be held in memory at once.
	        for chunk in iter(lambda: f_in.read(chunk_size), b''):
	            f_out.write(chunk)
	    return decompressed_file

	def convert_to_json(xml_file, json_dir, decompress, log_text):
	    """Run one conversion and report progress in ``log_text``.

	    Args:
	        xml_file: Path of the dump to convert.
	        json_dir: Directory that receives the output files.
	        decompress: When true, the input is passed through
	            ``decompress_file`` before it is processed.
	        log_text: Text widget used as the progress log; it is cleared first.
	    """
	    log_text.delete(1.0, tk.END)  # Clear previous logs
	    log_text.insert(tk.END, f"Starting conversion:\n -input: {xml_file}\n")
	    log_text.insert(tk.END, f" -output: {json_dir}\n")
	    log_text.update()

	    source_file = xml_file
	    if decompress:
	        # Only report decompression when it was actually requested.
	        log_text.insert(tk.END, "Decompressing...\n")
	        log_text.update()
	        xml_file = decompress_file(xml_file)
	        log_text.insert(tk.END, f"Decompressed: {xml_file}\n")
	        log_text.update()

	    log_text.insert(tk.END, "Conversion in progress...\n")
	    log_text.update()

	    process_file_text(xml_file, json_dir, log_text)

	    log_text.insert(tk.END, "Conversion completed!\n")
	    log_text.update()

	    # Remove the intermediate decompressed copy. The path is compared with
	    # the original input because xml_file no longer carries the .bz2 suffix
	    # after decompression, so a suffix test here would never match.
	    if xml_file != source_file:
	        os.remove(xml_file)

	def open_link(url):
	    """Open ``url`` via ``webbrowser.open_new``."""
	    webbrowser.open_new(url)

	def main():
	    """Build the converter window and run the Tk event loop.

	    The two required command-line arguments (dump file and output directory)
	    are used to pre-fill the corresponding entry fields.
	    """
	    parser = argparse.ArgumentParser(description='Process Wikipedia XML dump file and save as text files.')
	    parser.add_argument('xml_file', help='Wikipedia XML dump file (.xml or .xml.bz2)')
	    parser.add_argument('json_dir', help='Directory to save the converted JSON files')
	    args = parser.parse_args()

	    window = tk.Tk()
	    window.title("Wikipedia to text")

	    # Widgets
	    tk.Label(window, text="Wikipedia XML dump file:").grid(row=0, column=0, padx=5, pady=5)
	    xml_entry = tk.Entry(window, width=45)
	    xml_entry.grid(row=0, column=1, padx=5, pady=5)
	    xml_entry.insert(0, args.xml_file)

	    tk.Label(window, text="JSON save directory:").grid(row=1, column=0, padx=5, pady=5)
	    json_entry = tk.Entry(window, width=45)
	    json_entry.grid(row=1, column=1, padx=5, pady=5)
	    json_entry.insert(0, args.json_dir)

	    decompress_var = tk.BooleanVar()
	    decompress_check = tk.Checkbutton(window, text="Decompress if .bz2", variable=decompress_var)
	    decompress_check.grid(row=2, columnspan=3, padx=5, pady=5)

	    log_text = scrolledtext.ScrolledText(window, width=70, height=13)
	    log_text.grid(row=3, columnspan=3, padx=5, pady=5)

	    # The entries are read when the button is pressed, so edits made in the
	    # window take effect.
	    convert_button = tk.Button(window, text="Convert", command=lambda: convert_to_json(xml_entry.get(), json_entry.get(), decompress_var.get(), log_text))
	    convert_button.grid(row=4, columnspan=3, padx=5, pady=5)

	    # Add links
	    link_text = tk.Text(window, height=1, width=65)
	    link_text.grid(row=5, columnspan=3, padx=5, pady=5)
	    link_text.insert(tk.END, "Compressed Wikipedia dumps: ")
	    dump_links = (
	        ("English (21GB)", "https://dumps.wikimedia.org/enwiki/20240120/"),
	        ("Simple English (1GB)", "https://dumps.wikimedia.org/simplewiki/20240120/"),
	    )
	    for index, (label, url) in enumerate(dump_links):
	        if index:
	            # Plain separator; a backslash before the pipe would be an
	            # invalid escape sequence and show up in the window.
	            link_text.insert(tk.END, " | ")
	        # One tag per link: binding <Button-1> twice on a shared tag replaces
	        # the first handler, which made every link open the last URL.
	        tag = f"link{index}"
	        link_text.tag_configure(tag, foreground="blue", underline=True)
	        # url is bound as a default argument so each handler keeps its own URL.
	        link_text.tag_bind(tag, "<Button-1>", lambda e, url=url: open_link(url))
	        link_text.insert(tk.END, label, tag)
	    link_text.config(state=tk.DISABLED)

	    window.mainloop()

	# Start the GUI only when this file is executed as a script, not on import.
	if __name__ == "__main__":
	    main()