Last active
October 22, 2016 23:28
-
-
Save Mahdisadjadi/13e10eb2b69facac5ae617e70d79dc5b to your computer and use it in GitHub Desktop.
Removes ".html" from all links within HTML pages. I wrote this to fix this issue: https://github.com/emckiernan/whyopenresearch/issues/5
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import codecs # to read html | |
from bs4 import BeautifulSoup # to parse html | |
import glob # to find all files with a pattern | |
# Rewrite every "*.html" page in the current working directory in place so
# that the href of each <a> tag no longer contains ".html".
pages = glob.glob("*.html")
for page in pages:
    # Progress output: the file name without its final extension. rsplit
    # only cuts at the last dot, so a name such as "v1.2.html" is reported
    # as "v1.2" rather than being truncated at the first dot.
    print(page.rsplit(".", 1)[0])

    # The context manager guarantees the handle is closed even when reading
    # or parsing fails. codecs.open with an explicit encoding yields text on
    # both Python 2 and Python 3.
    # NOTE(review): assumes the pages are UTF-8 encoded - confirm.
    with codecs.open(page, "r", encoding="utf-8") as source:
        soup = BeautifulSoup(source.read(), "html.parser")

    # href=True restricts the search to anchors that actually carry an href
    # attribute; indexing link["href"] on a bare <a name="..."> would raise
    # KeyError otherwise.
    for link in soup.find_all("a", href=True):
        # NOTE(review): this removes every ".html" occurrence in the href,
        # including inside absolute URLs that point to other sites - confirm
        # that this is intended.
        link["href"] = link["href"].replace(".html", "")

    # Write back to the very file that was read (not a name rebuilt from a
    # split stem), re-indented by prettify().
    with codecs.open(page, "w", encoding="utf-8") as target:
        target.write(soup.prettify())
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment