Skip to content

Instantly share code, notes, and snippets.

@nmfisher
Created February 3, 2019 08:42
Show Gist options
  • Save nmfisher/317c42f31d8ee3a4880e4b884b7b63d3 to your computer and use it in GitHub Desktop.
Save nmfisher/317c42f31d8ee3a4880e4b884b7b63d3 to your computer and use it in GitHub Desktop.
import urllib
import urllib.request

from bs4 import BeautifulSoup
# Crawl configuration.
# NOTE: save_folder is assigned again further down, before anything is written
# to disk, so this first value is never used for saving.
save_folder = "/home/hydroxide/data/legal/docracy/"
base_url = "http://www.docracy.com"

# Only needed by the alternative "showalltagged" listing URL described below;
# defined once here instead of on every loop iteration.
tag = "lease"

# Absolute URLs of every document page discovered on the listing pages.
doc_urls = []

# Walk listing pages 1..109 (range's upper bound is exclusive).
for i in range(1, 110):
    # Alternative listing endpoints that can be swapped in for list_page_url:
    #   by tag:    base_url + "/doc/showalltagged?page=" + str(i) + "&tag=" + tag
    #   by search: base_url + "/doc/search?sortBy=-1&page=" + str(i)
    #              + "&query=" + search_term + "&s-submit=Submit"
    #              with search_term = "\"consulting+services\""
    list_page_url = base_url + "/doc/showall?page=" + str(i)
    print(list_page_url)

    # Parse inside the `with` so the HTTP response is closed as soon as the
    # page has been read, instead of being left open for the whole crawl.
    with urllib.request.urlopen(list_page_url) as response:
        list_page = BeautifulSoup(response, 'html.parser')

    # Every <h3 class="normalheight"> heading is expected to wrap a link to a
    # document page; its href is appended to base_url.
    doc_headings = list_page.find_all('h3', attrs={'class': 'normalheight'})
    for doc_heading in doc_headings:
        link = doc_heading.find("a")
        if link is None or not link.get("href"):
            # Heading without a usable link: nothing to follow, and indexing
            # it would raise instead of letting the crawl continue.
            continue
        doc_urls.append(base_url + link["href"])
# Parallel result lists: download_urls[k] is the "Word Doc" link of a document
# and tags[k] is the list of tag texts scraped from that same page. Both are
# appended together, so they always have the same length.
download_urls = []
tags = []

for doc_url in doc_urls:
    print(doc_url)

    # Parse inside the `with` so each HTTP response is closed right away.
    with urllib.request.urlopen(doc_url) as response:
        doc_page = BeautifulSoup(response, 'html.parser')

    # Pages without a download menu cannot be downloaded; skip them silently.
    menu = doc_page.find('ul', attrs={'id': 'download-options'})
    if menu is None:
        continue

    # href of the first download option whose label mentions "Word Doc",
    # or None when the menu offers no such option.
    href = next(
        (
            option["href"]
            for option in menu.find_all("a")
            if "Word Doc" in option.get_text()
        ),
        None,
    )

    # Pages without a tag list are skipped entirely, even if a Word link
    # was found above.
    tag_menu = doc_page.find('ul', attrs={'id': 'tags'})
    if tag_menu is None:
        continue
    doc_tags = [tag_link.get_text() for tag_link in tag_menu.find_all("a")]

    if href is not None:
        download_urls.append(href)
        print(href)
        tags.append(doc_tags)
        print(doc_tags)
    else:
        print("No Word link found")
import pickle
# Persist the scraped lists as pickle files in the current working directory.
# The `with` blocks guarantee each file is flushed and closed even if
# pickling raises.
with open("download_urls.p", "wb") as urls_file:
    pickle.dump(download_urls, urls_file)
with open("tags.p", "wb") as tags_file:
    pickle.dump(tags, tags_file)
# Destination directory for the downloaded documents and the tag index.
# This replaces any save_folder value assigned earlier in the script.
save_folder = "/virtualmachines/data/legal/docracy/"

# Local file name for each download: last path segment of the link + ".doc".
doc_file_names = [url.split("/")[-1] + ".doc" for url in download_urls]

# Fetch every document; the scraped links are appended to base_url.
for url, doc_file_name in zip(download_urls, doc_file_names):
    urllib.request.urlretrieve(base_url + url, save_folder + doc_file_name)

# Write one "<file name>,<tag>|<tag>|..." line per document. An explicit
# encoding keeps non-ASCII tag text from failing on non-UTF-8 locales.
with open(save_folder + "tags.txt", "w", encoding="utf-8") as outfile:
    for doc_file_name, file_tags in zip(doc_file_names, tags):
        row = doc_file_name + "," + "|".join(file_tags)
        print(row)
        outfile.write(row + "\n")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment