Skip to content

Instantly share code, notes, and snippets.

@haoliplus
Created May 11, 2019 11:09
Show Gist options
  • Save haoliplus/a863603960a8ee5f38d9a618d6a4d542 to your computer and use it in GitHub Desktop.
Save haoliplus/a863603960a8ee5f38d9a618d6a4d542 to your computer and use it in GitHub Desktop.
# update: 2019-5-11
import sys
import os
import time
from bs4 import BeautifulSoup
import requests
import pdfkit
def main(url):
    """Download the article at *url* and save it as a standalone HTML file.

    The page is fetched with ``requests``; every ``<img>`` carrying a
    ``data-src`` attribute gets that value copied into ``src``; JavaScript
    ``location.*`` references in the serialized markup are textually
    replaced with values derived from *url*; and the result is written to
    ``<output_path>/<title>.wechat.html``. The title (text of the first
    ``<h2>``) is printed at the end.

    Args:
        url: Address of the article page to fetch.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
        OSError: If the output file cannot be written.
    """
    # NOTE(review): ``...`` is the Ellipsis literal; it looks like a
    # placeholder to be replaced with the real output directory -- confirm.
    output_path = ...
    # NOTE(review): sys.path only affects module import lookup; presumably
    # this was meant to make a binary in /usr/local/bin reachable -- confirm.
    sys.path.append('/usr/local/bin/')

    # A timeout keeps the script from hanging forever on a stalled
    # connection; raise_for_status stops us from saving an error page.
    r = requests.get(url, timeout=30)
    r.raise_for_status()
    soup = BeautifulSoup(r.text, 'html.parser')

    # find() returns None when the page has no <h2>; fall back to a fixed
    # name instead of failing with AttributeError.
    heading = soup.find('h2')
    title = heading.text.strip() if heading is not None else ''
    if not title:
        title = 'untitled'

    # Copy the deferred image address into ``src`` so the saved page
    # references the image directly.
    for img in soup.find_all('img'):
        if img.has_attr('data-src'):
            img['src'] = img['data-src']

    host = 'mp.weixin.qq.com'
    pathname = url.replace('https://', '').replace(host, '')

    # Applied strictly in this order; each pair is (text to find, replacement).
    # NOTE(review): the ``windows.`` spelling (browser global is ``window``)
    # is kept byte-for-byte from the original -- confirm whether it is a typo.
    replacements = (
        ('windows.location.href', url),
        ('location.href', url),
        ('windows.location.search', ''),
        ('location.search', ''),
        ('windows.location.protocol', 'https:'),
        ('location.protocol', 'https:'),
        ('windows.location.pathname', pathname),
        ('location.pathname', pathname),
        ('windows.location.host', host),
        ('location.host', host),
        ('"//res.wx.qq.com/', '"https://res.wx.qq.com/'),
    )
    content = str(soup)
    for old, new in replacements:
        content = content.replace(old, new)

    # A path separator inside the title would be interpreted as a directory
    # boundary and break the open() call below.
    file_title = title
    for sep in (os.sep, os.altsep):
        if sep:
            file_title = file_title.replace(sep, '_')

    path = '{}/{}.wechat.html'.format(output_path, file_title)
    # Explicit UTF-8 so non-ASCII article text is written regardless of the
    # platform's default encoding; ``with`` guarantees the file is closed.
    with open(path, 'w', encoding='utf-8') as f:
        f.write(content)
    print(title)
# Run the download only when executed as a script, so importing this module
# does not trigger a network request.
if __name__ == '__main__':
    # All command-line arguments are concatenated (no separator) into one URL.
    query_text = ''.join(sys.argv[1:])
    if not query_text:
        # Without a URL there is nothing to fetch; exit with a usage hint.
        sys.exit('usage: python {} <article-url>'.format(sys.argv[0]))
    main(query_text)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment