@kevingoldsmith
Created August 14, 2020 05:42
Clean up the markdown created by pelican-import from a WordPress export, including downloading and re-adding all of the images
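A minimal usage sketch (the script name and file names here are placeholders): assuming the script is saved as cleanup_pelican_import.py, it takes one imported markdown file plus the WordPress export XML, for example:

python cleanup_pelican_import.py my-post.md wordpress-export.xml --outputdirectory content --imagesdirectory images

With those arguments it writes the cleaned markdown to content/my-post.md and downloads the post's images into content/images; note that the rewritten image references themselves point at /articles/images/, which is hard-coded in update_image_tag.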
import argparse
import xml.etree.ElementTree as ET
from bs4 import BeautifulSoup
import urllib.request
from urllib.parse import urlparse
import os
import re

def get_downloads_from_img(img):
    return_list = []
    src = img.get('src')
    if src is not None:
        parsed = urlparse(src)
        filename = os.path.basename(parsed.path)
        return_list.append((filename, src, src))
    srcset = img.get('srcset')
    if srcset is not None:
        srcs = srcset.split(',')
        for src in srcs:
            original = src
            src = src.strip()
            src = src.split(' ')[0]
            parsed = urlparse(src)
            filename = os.path.basename(parsed.path)
            return_list.append((filename, src, original))
    return return_list
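
# Illustrative note (the URLs below are hypothetical): for a tag such as
#   <img src="https://example.com/photo.jpg"
#        srcset="https://example.com/photo-300x200.jpg 300w, https://example.com/photo.jpg 1024w">
# get_downloads_from_img returns one (filename, url, original srcset entry) tuple per source:
#   [('photo.jpg', 'https://example.com/photo.jpg', 'https://example.com/photo.jpg'),
#    ('photo-300x200.jpg', 'https://example.com/photo-300x200.jpg', 'https://example.com/photo-300x200.jpg 300w'),
#    ('photo.jpg', 'https://example.com/photo.jpg', ' https://example.com/photo.jpg 1024w')]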

def find_url(possible_url):
    finds = re.findall('(?i)\\b((?:https?://|www\\d{0,3}[.]|[a-z0-9.\\-]+[.][a-z]{2,4}/)(?:[^\\s()<>]+|\\(([^\\s()<>]+|(\\([^\\s()<>]+\\)))*\\))+(?:\\(([^\\s()<>]+|(\\([^\\s()<>]+\\)))*\\)|[^\\s`!()\\[\\]{};:\'\\".,<>?«»“”‘’]))', possible_url)
    if len(finds) == 0:
        return None
    return finds[0][0]

def find_image_matches(url, file_paths):
    return [(filename, src, original) for filename, src, original in file_paths if src == url]

def update_image_tag(img, file_paths, soup):
    newimg = soup.new_tag('img')
    SKIP_ATTRS = ['class', 'style']
    for attr in img.attrs:
        if attr not in SKIP_ATTRS:
            if attr == 'src':
                matches = find_image_matches(img.attrs[attr], file_paths)
                newimg.attrs[attr] = '/articles/images/' + matches[0][0]
            elif attr == 'srcset':
                new_srcset = img.attrs[attr]
                for path in file_paths:
                    orig_split = path[2].split(' ')
                    if len(orig_split) > 1:
                        new_srcset = new_srcset.replace(path[1], '/articles/images/' + path[0])
                newimg.attrs[attr] = new_srcset
            else:
                newimg.attrs[attr] = img.attrs[attr]
    return newimg

parser = argparse.ArgumentParser(description='clean up a file imported to markdown from wordpress.xml using pelican-import')
parser.add_argument('markdownfile', type=argparse.FileType('r'))
parser.add_argument('wordpressxmlfile', type=argparse.FileType('r'))
parser.add_argument('--outputdirectory', default='', type=str)
parser.add_argument('--imagesdirectory', default='', type=str)
args = parser.parse_args()

outputfile = os.path.join(args.outputdirectory, os.path.basename(args.markdownfile.name))
imagepath = os.path.join(args.outputdirectory, args.imagesdirectory)
if args.outputdirectory:
    os.makedirs(args.outputdirectory, exist_ok=True)
if args.imagesdirectory:
    os.makedirs(imagepath, exist_ok=True)

lines = []
with args.markdownfile as f:
    lines = f.readlines()

# get rid of the obvious stuff I don't want
newlines = []
extratags = []
for line in lines:
    if line.startswith('Category: '):
        category = line[10:]
        commaloc = category.find(',')
        if commaloc != -1:
            onecategory = category[:commaloc]
            print(f'WARNING: multiple categories found, selecting the first one: {onecategory}, and turning the rest into tags')
            newlines.append(f'Category: {onecategory}\n')
            extratags = category[commaloc:].split(',')
            continue
    if line.startswith('Tags: ') and len(extratags) > 0:
        line = line.rstrip()
        for tag in extratags:
            tag = tag.strip()
            if len(tag) > 0:
                line += ', ' + tag
        line += '\n'
        newlines.append(line)
        continue
    if not (line.startswith('Author: ') or
            line.startswith('Status:') or
            line.startswith('`<!-- wp:paragraph') or
            line.startswith('`<!-- /wp:paragraph') or
            line.startswith('--')):
        newlines.append(line)

# clean up ugly header gunk
newlines2 = []
inheading = False
title = ''
for line in newlines:
    if line.startswith('Title: '):
        # remove the metadata
        title = line[7:].rstrip()
    elif line.startswith('`<!-- wp:heading'):
        inheading = True
        continue
    elif line.startswith('`<!-- /wp:heading'):
        inheading = False
        continue
    elif inheading and len(line) == 1:
        continue
    if inheading:
        if not line.startswith('#'):
            newline = '# ' + line
        else:
            newline = line
        newlines2.append(newline)
        continue
    newlines2.append(line)

# clean up redundant blank lines
newlines3 = []
lastline = ''
for line in newlines2:
    if (line == lastline) and len(line) == 1:
        continue
    lastline = line
    newlines3.append(line)

# clean up poor spaces in lists
newlines4 = []
inlist = False
for line in newlines3:
    if line.startswith('`<!-- wp:list'):
        inlist = True
        continue
    elif line.startswith('`<!-- /wp:list'):
        inlist = False
        continue
    elif inlist and len(line) == 1:
        continue
    elif inlist and line.startswith('- '):
        line = '- ' + line[4:]
    newlines4.append(line)

original_url = ''
linkline = ''
blogtitle = ''

# parse the wordpress xml to get original link
tree = ET.parse(args.wordpressxmlfile.name)
root = tree.getroot()
item = root.find('./channel/title')
if item is not None:
    blogtitle = item.text
for item in root.findall('./channel/item'):
    title_element = item.find('./title')
    if not ((title_element is not None) and title == title_element.text):
        continue
    link_element = item.find('./link')
    if link_element is not None:
        original_url = link_element.text
        if blogtitle:
            linkline = f'*Originally published on [{blogtitle}]({original_url})*\n'
        else:
            linkline = f'*Originally published at [{original_url}]({original_url})*\n'
    break

newlines4.append('\n')
newlines4.append(linkline)

newlines5 = []
# get the images
if original_url:
    post_thumbnail_img = None
    images = []
    response = urllib.request.urlopen(original_url)
    original_post = response.read()
    soup = BeautifulSoup(original_post, features="lxml")
    article = soup.find('article')
    if article is not None:
        thumbnail_div = article.find('div', {'class': 'post-thumbnail'})
        if thumbnail_div is not None:
            post_thumbnail_img = thumbnail_div.find('img')
        else:
            thumbnail_a = article.find('a', {'class': 'post-thumbnail'})
            if thumbnail_a is not None:
                post_thumbnail_img = thumbnail_a.find('img')
        entry_content_div = article.find('div', {'class': 'entry-content'})
        if entry_content_div is not None:
            images = entry_content_div.find_all('img')

    # make a list of images to download
    downloads = []
    if post_thumbnail_img is not None:
        downloads.extend(get_downloads_from_img(post_thumbnail_img))
    for image in images:
        downloads.extend(get_downloads_from_img(image))

    # download the images
    for download in downloads:
        print(f'downloading: {download[1]}')
        urllib.request.urlretrieve(download[1], os.path.join(imagepath, download[0]))

    if post_thumbnail_img is not None:
        # find the first non-metadata content line by finding the first line after an empty line
        line_num = 0
        for line in newlines4:
            if len(line) == 1:
                line_num += 1
                break
            line_num += 1
        new_thumbnail = update_image_tag(post_thumbnail_img, downloads, soup)
        newlines4.insert(line_num, str(new_thumbnail) + '\n')
        newlines4.insert(line_num + 1, '\n')

    # clean up images
    inimg = False
    for line in newlines4:
        if line.startswith('`<!-- wp:image'):
            inimg = True
            continue
        elif line.startswith('`<!-- /wp:image'):
            inimg = False
            continue
        elif inimg and len(line) == 1:
            continue
        elif inimg and line.startswith('!['):
            imgurl = find_url(line)
            if imgurl is None:
                continue
            # find the original image tag
            for image in images:
                if image['src'] == imgurl:
                    new_image = update_image_tag(image, downloads, soup)
                    newlines5.append(str(new_image) + '\n')
                    break
        else:
            newlines5.append(line)
else:
    newlines5 = newlines4

with open(outputfile, 'w') as f:
    f.writelines(newlines5)
@djsnipa1

Thank you for this!
