Last active
June 20, 2023 00:21
-
-
Save Patrick-1994-/acd41c084790aa9d07499b9a76245891 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3
'''
Created on May 8, 2010 by @anasimtiaz
Original script URL: http://anasimtiaz.com/?p=51
That link is dead; this appears to be the current one: https://anasimtiaz.com/2014/03/08/wordpress-xml-splitter-again/
Forked on May 28, 2016 by @danielwrobert
https://gist.github.com/danielwrobert/6c9ca8de8199d5430621f481673d4baa
Changed into a command-line script by @Patrick-1994- on Jan 24th 2022
VERSION 3.0
Added the -s option to change the split size by @Patrick-1994- in 2023
(also --help and --version, so now I have to give this a version number ^^)
'''
import os | |
import sys | |
import re | |
# Default upper bound for one output chunk, counted in characters of the
# decoded input text; the -s command-line option overrides it.
DEFAULT_SPLIT_SIZE = 500_000

# Version string printed by --version.
VERSION = "3.0"
def writeHeader(currentFile):
    """Write the WXR/RSS preamble that opens a continuation chunk.

    Emits the XML declaration, the opening ``<rss>`` element with its
    namespace declarations, the opening ``<channel>`` tag and the
    ``<wp:wxr_version>`` element, followed by a newline.

    Args:
        currentFile: Writable text stream (any object with ``write``).
    """
    # Built line by line so the emitted text does not depend on how this
    # source file happens to be indented or wrapped.
    header_lines = (
        '<?xml version="1.0" encoding="UTF-8"?>',
        '<rss version="2.0"',
        'xmlns:excerpt="http://wordpress.org/export/1.2/excerpt/"',
        'xmlns:content="http://purl.org/rss/1.0/modules/content/"',
        'xmlns:wfw="http://wellformedweb.org/CommentAPI/"',
        'xmlns:dc="http://purl.org/dc/elements/1.1/"',
        'xmlns:wp="http://wordpress.org/export/1.2/"',
        '>',
        '<channel>',
        '<wp:wxr_version>1.2</wp:wxr_version>',
        '',  # yields the trailing newline after the last element
    )
    currentFile.write("\n".join(header_lines))
def writeFooter(currentFile):
    """Write the closing ``</channel>`` and ``</rss>`` tags of a chunk.

    The text starts with a newline so the closing tags begin on their own
    line after whatever was written last; no newline follows ``</rss>``.

    Args:
        currentFile: Writable text stream (any object with ``write``).
    """
    currentFile.write("\n</channel>\n</rss>")
def errExit():
    """Terminate the program with exit status 1 by raising SystemExit."""
    raise SystemExit(1)
def err_print(s):
    """Print *s*, followed by a newline, to standard error.

    Args:
        s: Object to print; it is converted the same way ``print`` does.
    """
    # Looked up on every call so a replaced sys.stderr is honoured.
    error_stream = sys.stderr
    print(s, file=error_stream)
def startProc(input_file_path, outDir, split_size):
    """Split a WordPress export (WXR) file into several smaller files.

    The input is read completely and cut after a closing ``</item>`` tag so
    that no item is split across two files.  Chunks are written to *outDir*
    as ``<name>_<n><ext>``, where ``<name>`` and ``<ext>`` come from the
    input file name and ``<n>`` counts up from 0.

    Chunk 0 starts at the beginning of the input and the final chunk runs to
    its end, so they already carry the input's own opening and closing
    markup.  For that reason ``writeHeader`` is only called for chunks after
    the first, and ``writeFooter`` only for chunks before the last.

    Args:
        input_file_path: Path of the export file to split (read as UTF-8).
        outDir: Existing directory that receives the chunk files.
        split_size: Maximum number of characters taken from the input per
            chunk.  A chunk is larger only when a single item does not fit
            in the window; the item is then kept whole.

    Raises:
        ValueError: If *split_size* is not positive.
        OSError: If the input cannot be read or a chunk cannot be written.
        UnicodeDecodeError: If the input is not valid UTF-8.
    """
    if split_size <= 0:
        raise ValueError("split_size must be a positive number of characters")

    item_end = '</item>'
    fileName = os.path.basename(input_file_path)
    stem, ext = os.path.splitext(fileName)

    # The header written into continuation chunks declares UTF-8, so read
    # and write with that encoding instead of the platform default.
    with open(input_file_path, "r", encoding="utf-8") as source:
        xml_text = source.read()
    total = len(xml_text)

    offset = 0
    iteration = 0
    finished = False
    while not finished:
        chunk_name = stem + "_" + str(iteration) + ext

        # Default: everything that is left goes into this (final) chunk.
        chunk_end = total
        if offset + split_size < total:
            # Cut after the last </item> that lies completely in the window.
            cut = xml_text.rfind(item_end, offset, offset + split_size)
            if cut == -1:
                # No item ends inside the window: extend the chunk to the
                # end of the next item rather than cutting through it.
                cut = xml_text.find(item_end, offset)
            if cut != -1:
                chunk_end = cut + len(item_end)
        finished = chunk_end >= total

        with open(os.path.join(outDir, chunk_name), 'w', encoding="utf-8") as chunk:
            print('Writing file ' + chunk_name)
            if iteration != 0:
                writeHeader(chunk)
            chunk.write(xml_text[offset:chunk_end])
            if not finished:
                writeFooter(chunk)

        if finished:
            print('Finished processing \n')
        offset = chunk_end
        iteration += 1
def print_usage():
    """Print the command-line synopsis to standard error.

    The program name is taken from ``sys.argv[0]`` at call time, and the
    optional ``-s SPLIT_SIZE`` is shown after it, which is where the
    argument parsing of this script expects it.
    """
    err_print(f"usage: {sys.argv[0]} [ -s SPLIT_SIZE ] input_file_path [outDir]\noutDir is the current working directory by default")
def print_version():
    """Print the script's VERSION string on standard output."""
    print(VERSION, file=sys.stdout)
if __name__ == '__main__':
    # Command-line entry point:
    #   prog [--help | --version]
    #   prog [-s SPLIT_SIZE] input_file_path [outDir]
    # ARGV0 is left as a module-level name so code that formats the usage
    # line with it keeps working.
    ARGV0 = sys.argv[0]
    split_size = DEFAULT_SPLIT_SIZE
    command_line_args = sys.argv[1:]

    if not command_line_args:  # no args
        print_usage()
        errExit()

    if command_line_args[0] == "--help":
        print_usage()
        sys.exit(0)

    if command_line_args[0] == "--version":
        print_version()
        sys.exit(0)

    if command_line_args[0] == "-s":
        if len(command_line_args) < 2:
            err_print("-s given but no SPLIT_SIZE follows")
            print_usage()
            errExit()
        # isdecimal() only accepts digit strings that int() can parse;
        # isnumeric() would also let through characters such as "²".
        if not command_line_args[1].isdecimal():
            err_print( "-s given but the arg is not numeric" )
            errExit()
        split_size = int(command_line_args[1])
        if split_size <= 0:
            err_print("-s SPLIT_SIZE must be greater than zero")
            errExit()
        command_line_args = command_line_args[2:]

    # Exactly one input file and at most one output directory may remain.
    if len(command_line_args) not in (1, 2):
        err_print("expected input_file_path and an optional outDir")
        print_usage()
        errExit()

    input_file_path = command_line_args[0]
    if len(command_line_args) == 2:
        outDir = command_line_args[1]
    else:
        outDir = os.path.abspath( os.path.realpath(".") )

    if not os.path.isfile(input_file_path):
        err_print(f"""input file "{input_file_path}" does not exist""")
        print_usage()
        errExit()

    if not os.path.isdir(outDir):
        # Report the directory that was actually checked.
        err_print(f"""output directory "{outDir}" does not exist""")
        print_usage()
        errExit()

    startProc(input_file_path, outDir, split_size)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment