Skip to content

Instantly share code, notes, and snippets.

@rana-ahmed
Created June 7, 2015 17:39
Show Gist options
  • Select an option

  • Save rana-ahmed/e06ad6055427f5b32930 to your computer and use it in GitHub Desktop.

Select an option

Save rana-ahmed/e06ad6055427f5b32930 to your computer and use it in GitHub Desktop.
A simple web crawler for http://www.national500apps.com/
import csv
import requests
from bs4 import BeautifulSoup
def initDataFile():
    """Create (or truncate) ``data.csv`` and write the column header row.

    The file is written to the current working directory. Any existing
    ``data.csv`` is overwritten.
    """
    import sys  # local import: only needed to choose the file mode below

    header = [
        'Name',
        'Address',
        'Email',
        'Mobile No',
        'University Name',
        'Division Name',
        'National ID No./Birth Registration No.',
        'Date of Birth',
        'Attend 5 days Training',
        '1st Preferred Training Venue',
        '2nd Preferred Training Venue',
        '3rd Preferred Training Venue',
        'Status',
    ]

    # The csv module expects a binary file on Python 2 but a text file opened
    # with newline='' on Python 3 (where 'wb' makes the writer raise
    # TypeError), so pick the mode that matches the running interpreter.
    if sys.version_info[0] < 3:
        fp = open('data.csv', 'wb')
    else:
        fp = open('data.csv', 'w', newline='', encoding='utf-8')
    with fp:
        csv.writer(fp, delimiter=',').writerow(header)
def parseData(content):
    """Extract one record from a profile page and append it to ``data.csv``.

    The page is expected to contain a ``<table id="yw0">``; the text of the
    first ``<td>`` of every ``<tr>`` in that table becomes one CSV column,
    in document order.

    Args:
        content: HTML of the page (text or UTF-8 encoded bytes).

    Returns:
        None. When the page has no ``yw0`` table nothing is written.
    """
    import sys  # local import: only needed to choose the file mode below

    # Name the parser explicitly so the result does not depend on which
    # optional parsers (lxml, html5lib) happen to be installed.
    soup = BeautifulSoup(content, "html.parser")
    table = soup.find("table", id="yw0")
    if table is None:
        # No detail table on this page: there is no record to store.
        return

    data = []
    for row in table.findAll("tr"):
        cell = row.find('td')
        # A row without a <td> still gets an (empty) column so that the
        # remaining values stay aligned with the header.
        data.append(cell.get_text() if cell is not None else '')

    # The csv module expects UTF-8 byte strings and a binary file on
    # Python 2, but text and a file opened with newline='' on Python 3.
    if sys.version_info[0] < 3:
        data = [value.encode('utf-8') for value in data]
        fp = open('data.csv', 'ab')
    else:
        fp = open('data.csv', 'a', newline='', encoding='utf-8')
    with fp:
        csv.writer(fp, delimiter=',').writerow(data)
def fetchData(url, timeout=30):
    """Download ``url`` and, on HTTP 200, pass the page to ``parseData``.

    Args:
        url: address to request with HTTP GET.
        timeout: seconds ``requests`` waits for the server before giving up.
            Without a timeout a stalled connection would block forever.

    Returns:
        The HTTP status code of the response. ``parseData`` is called only
        when that code is 200.

    Raises:
        requests.exceptions.RequestException: connection errors and timeouts
            are not caught here and propagate to the caller.
    """
    r = requests.get(url, timeout=timeout)
    if r.status_code != 200:
        return r.status_code

    # Strip line breaks while the body is still text and encode once
    # afterwards; calling bytes.replace() with str arguments fails on
    # Python 3.
    content = r.text.replace("\r", "").replace("\n", "").encode('utf-8', 'ignore')
    parseData(content)
    return r.status_code
def main(url, startId=1, maxFailures=51):
    """Crawl profile pages for consecutive registration ids.

    The header row is written to ``data.csv`` first. Then ``url`` followed by
    each id (``startId``, ``startId + 1``, ...) is passed to ``fetchData``.
    Every result other than 200 is appended to ``error_log.txt`` as
    ``"<id> ,<status>"``. The crawl ends once ``maxFailures`` results in a
    row were not 200; any 200 result resets that count.

    Args:
        url: URL prefix to which the numeric registration id is appended.
        startId: first registration id to visit.
        maxFailures: number of consecutive non-200 results that ends the
            crawl.
    """
    initDataFile()

    registrationId = startId
    failures = 0
    while failures < maxFailures:
        # Single-argument print() produces the same output on Python 2 and 3.
        print('visiting user id ' + str(registrationId))
        response = fetchData(url + str(registrationId))
        if response == 200:
            failures = 0
        else:
            failures += 1
            # Append mode keeps entries from earlier runs in the log.
            with open("error_log.txt", "a") as logFile:
                logFile.write(str(registrationId) + ' ,' + str(response) + '\n')
        registrationId += 1

    print('THE END')
if __name__ == "__main__":
    # Single-argument print() works on both Python 2 and Python 3, unlike
    # the print statement, which is a SyntaxError on Python 3.
    print("Program Started")
    # main() appends a numeric registration id to this prefix for each page.
    url = 'http://www.national500apps.com/index.php?r=trainingTrainee/view&id='
    main(url)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment