Created
August 3, 2015 14:58
-
-
Save ejmurray/5fdc3f2f74ccff9cd46a to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Fetch a journal table-of-contents page and save its <h4> headings.

The script downloads the page at ``url``, collects the text of every
``<h4>`` element, prints each one and writes it (one per line, UTF-8
encoded) to ``dmdTitles2.txt`` in the current working directory.

Earlier revisions had two known problems, both addressed below:

* some titles came back as ``None`` because ``Tag.string`` only works when
  the tag has exactly one child node;
* writing titles with ``str(...)`` failed on non-ASCII characters.

NOTE(review): scrapePubMed2.py was mentioned as having an alternative way of
handling the encoding issue -- confirm whether it is still needed.
"""
__author__ = 'Ernest'

from bs4 import BeautifulSoup
import urllib2
import re

url = "http://dmd.aspetjournals.org/content/43/3.toc"

# urllib2 responses are not context managers on Python 2, so close the
# connection explicitly once BeautifulSoup has consumed the markup.
page = urllib2.urlopen(url)
try:
    # "lxml" selects the parser; the document bytes are decoded as UTF-8.
    soup_page = BeautifulSoup(page, "lxml", from_encoding="utf-8")
finally:
    page.close()

# The headings that are treated as article titles.
title_entries = soup_page.find_all("h4")

# Binary mode plus an explicit encode keeps the output UTF-8 regardless of the
# interpreter's default encoding; the ``with`` block guarantees the file is
# flushed and closed even if a write fails part-way through.
with open("dmdTitles2.txt", "wb") as f:
    for title in title_entries:
        # get_text() concatenates the text of all descendants, so headings
        # containing inline markup (<em>, <sub>, ...) are no longer lost as
        # None. split()/join() collapses runs of whitespace and newlines.
        text = " ".join(title.get_text().split())
        print(text)
        f.write(text.encode("utf-8") + b"\n")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment