Created June 5, 2014 10:21
my-1st-crawler-alt.py: downloads the HTML document instead of reading it from the clipboard (I haven't tested it yet)
import pprint, re, urllib, urllib2
from bs4 import BeautifulSoup

# Download the page and parse it.
html = urllib2.urlopen('http://SOMEWEBSITE').read()
soup = BeautifulSoup(html)
stuff = soup(class_="WHATEVER")

# I don't quite understand the following line; it inserts a '\n', which is why
# the first line of the output file is skipped when it is read back below.
stuff.insert(0, stuff)

# Write the pretty-printed tags to a temporary file, roughly one tag per line.
f = open('/Users/henry/Desktop/output.txt', 'w')
pprint.pprint(stuff, f)
f.close()

# I tried 'pprint.pformat', which returns a string, but then the loop below
# would not iterate over it line by line.
f = open('/Users/henry/Desktop/output.txt', 'r')
lines = f.readlines()[1:]  # skip the first line
for line in lines:
    # Skip lines where the image URL or the quoted name is missing, so
    # .group() is never called on a failed match.
    url_match = re.search(r'http.*?jpg', line)
    name_match = re.search(r'".*?"', line)
    if url_match is None or name_match is None:
        continue
    a = url_match.group()
    b = name_match.group().strip('"') + '.jpg'
    urllib.urlretrieve(a, b)
f.close()
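
The comment about pprint.pformat above points at the sticking point: pformat returns one big string, so a plain for loop over it walks characters, not lines. A minimal sketch of one way around that, using the same placeholder URL and class name as the script and kept in Python 2 to match the gist, is to split the formatted string with splitlines() and skip the temporary file entirely:

import pprint, re, urllib, urllib2
from bs4 import BeautifulSoup

html = urllib2.urlopen('http://SOMEWEBSITE').read()
soup = BeautifulSoup(html)
stuff = soup(class_="WHATEVER")

# pformat returns a single string; splitlines() turns it into a list of lines,
# so the loop can walk it directly without writing output.txt first.
for line in pprint.pformat(stuff).splitlines():
    url_match = re.search(r'http.*?jpg', line)
    name_match = re.search(r'".*?"', line)
    if url_match and name_match:
        urllib.urlretrieve(url_match.group(),
                           name_match.group().strip('"') + '.jpg')

splitlines() yields the same lines readlines() would, just without the trailing newlines, so the two regexes behave the same as in the file-based version.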