Last active
November 27, 2018 22:11
-
-
Save 5shekel/a8d39c193bf6a2eec0516ebeb07d8f7a to your computer and use it in GitHub Desktop.
get yo links
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Script guidance from John G. Fisher.
# See his YouTube video: https://www.youtube.com/watch?v=e6xZAISu-5E
# Grab all the links we talked about.
from bs4 import BeautifulSoup | |
import requests | |
import re | |
#use 'python http.server' on the directory to serve the folder. sb4 doesnt do local links | |
inbox_path='http://localhost:8000/messages/inbox/ElBootkeh_64b09d1ba5/message.html' | |
print(inbox_path) | |
url = requests.get(inbox_path) | |
soup = BeautifulSoup(url.text, 'html.parser') | |
with open('idiot_links.txt', 'w') as f: | |
for link in soup.find_all(href=True): | |
try: | |
#grab href see> https://stackoverflow.com/q/5815747/184085 | |
link=link['href'] | |
print(link) | |
# afterwards pass .html\?.*$ into .html (in vscode to fomrat long aliexpress links...) | |
link=re.sub(r'.html\?.*$',r'.html',link) | |
#TBD remove stickers messages with '^messages/stickers_used.*png' | |
#TBD shorten deep dirs | |
#TBD remove dups 'awk '!seen[$0]++' idiot_links.txt > idiot_linksU.txt' | |
#TBD reverse order | |
#TBD add {{currly bractes}} to media | |
#TBD extract date of event | |
#TBD auto uplaod relevant files to wiki | |
f.write(link.lstrip() + '\n\n') | |
except: | |
TypeError | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment