Skip to content

Instantly share code, notes, and snippets.

@hemanth415
Last active April 16, 2019 18:09
Show Gist options
  • Save hemanth415/d2377c7d66de6cd26d6f55d1468da774 to your computer and use it in GitHub Desktop.
Save hemanth415/d2377c7d66de6cd26d6f55d1468da774 to your computer and use it in GitHub Desktop.
Python script to extract hrefs from a list of pages.
from bs4 import BeautifulSoup
import urllib.request
# Collect the hyperlinks found on every page listed in url.txt (one URL per
# line) and write them to output.txt as "page_url,href,link_text" rows.

# "href -- text" keys that have already been written. The set is shared across
# all pages, so a link that reappears on a later page is emitted only once,
# attributed to the first page on which it was seen.
seen = set()

# Both files are managed by `with` so they are closed even if a fetch or a
# parse raises part-way through. The output is written as UTF-8 because link
# text scraped from arbitrary pages may contain any Unicode character.
with open('url.txt') as lines, open("output.txt", "w", encoding="utf-8") as out:
    for line in lines:
        page_url = line.strip()
        if not page_url:
            # Blank lines (e.g. a trailing newline at the end of url.txt)
            # would otherwise make urlopen raise "unknown url type".
            continue

        # Close the HTTP response as soon as the document has been parsed.
        with urllib.request.urlopen(page_url) as resp:
            soup = BeautifulSoup(
                resp,
                from_encoding=resp.info().get_param('charset'),
                features='html.parser',
            )

        for link in soup.find_all('a', href=True):
            href = link['href']
            # The dedup key joins the link's text fragments with a space,
            # while the written row joins them with no separator.
            key = ''.join([href, ' -- ', link.get_text(' ', strip=True)])
            if key in seen:
                continue
            seen.add(key)
            out.write(','.join([page_url, href, link.get_text('', strip=True)]) + '\n')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment