ejmurray · August 3, 2015 14:58
diff --git a/dmdScrape_url.py b/dmdScrape_url.py
 #!/usr/bin/env python
 # -*- coding: utf-8 -*-

 # Download the table-of-contents page at `url`, collect the text of every
 # <h4> heading (presumably the article titles -- confirm against the page
 # markup), print each one and save them, one per line, to dmdTitles2.txt.
 #
 # This is a Python 2 script: it relies on urllib2.
 __author__ = 'Ernest'
 import io
 import re  # not used by the live code; kept from the original import list
 import urllib2

 from bs4 import BeautifulSoup

 url = "http://dmd.aspetjournals.org/content/43/3.toc"

 # Fetch the page and parse it with the lxml parser, telling BeautifulSoup
 # that the bytes are UTF-8. urllib2 responses are not context managers on
 # Python 2, so try/finally is used to close the connection after parsing.
 page = urllib2.urlopen(url)
 try:
     soup_page = BeautifulSoup(page, "lxml", from_encoding="utf-8")
 finally:
     page.close()

 # Each <h4> element is treated as one title.
 title_entries = soup_page.find_all("h4")

 # Tag.string is None when a heading has more than one child node (e.g. text
 # mixed with inline markup), which made the script print "None" instead of
 # the title. get_text() concatenates all text inside the tag; split/join
 # then collapses line breaks and repeated spaces so that every title
 # occupies exactly one line.
 titles = [u" ".join(entry.get_text().split()) for entry in title_entries]

 # io.open with an explicit encoding accepts unicode text on Python 2, so
 # non-ASCII titles are written as UTF-8 instead of failing in str(); the
 # with-block guarantees the file is flushed and closed.
 with io.open("dmdTitles2.txt", "w", encoding="utf-8") as f:
     for title in titles:
         f.write(title + u"\n")

 # Echo the titles to stdout as the original script did.
 for title in titles:
     print(title)
	#!/usr/bin/env python
	# -*- coding: utf-8 -*-

	# Download the table-of-contents page at `url`, collect the text of every
	# <h4> heading (presumably the article titles -- confirm against the page
	# markup), print each one and save them, one per line, to dmdTitles2.txt.
	#
	# This is a Python 2 script: it relies on urllib2.
	__author__ = 'Ernest'
	import io
	import re  # not used by the live code; kept from the original import list
	import urllib2

	from bs4 import BeautifulSoup

	url = "http://dmd.aspetjournals.org/content/43/3.toc"

	# Fetch the page and parse it with the lxml parser, telling BeautifulSoup
	# that the bytes are UTF-8. urllib2 responses are not context managers on
	# Python 2, so try/finally is used to close the connection after parsing.
	page = urllib2.urlopen(url)
	try:
	    soup_page = BeautifulSoup(page, "lxml", from_encoding="utf-8")
	finally:
	    page.close()

	# Each <h4> element is treated as one title.
	title_entries = soup_page.find_all("h4")

	# Tag.string is None when a heading has more than one child node (e.g. text
	# mixed with inline markup), which made the script print "None" instead of
	# the title. get_text() concatenates all text inside the tag; split/join
	# then collapses line breaks and repeated spaces so that every title
	# occupies exactly one line.
	titles = [u" ".join(entry.get_text().split()) for entry in title_entries]

	# io.open with an explicit encoding accepts unicode text on Python 2, so
	# non-ASCII titles are written as UTF-8 instead of failing in str(); the
	# with-block guarantees the file is flushed and closed.
	with io.open("dmdTitles2.txt", "w", encoding="utf-8") as f:
	    for title in titles:
	        f.write(title + u"\n")

	# Echo the titles to stdout; the loop body is indented here because the
	# pasted original had lost its indentation and could not be parsed.
	for title in titles:
	    print(title)