Example of scraping a site that requires form submission
import requests
from bs4 import BeautifulSoup

# Iterate over author pages
for page in range(1, 2):  # TODO: update to number of pages + 1 (i.e. 15 pages would be 16)
    data = {"gruppo": "autori",
            "iniziale": "all",
            "pag": page}
    response = requests.post("http://digiliblt.lett.unipmn.it/testi.php", data=data)
    soup = BeautifulSoup(response.text, "lxml")

    # Look for all the links to authors
    for link in soup.find_all("a", {"href": "#", "onclick": True}):
        link_val = link['onclick']
        # Get the author ids
        if "scheda_autori" in link_val.lower():
            author_id = link_val.split("'")[1]
            data = {"id": author_id,
                    "bpag": "1"}
            # Load the author page based on the author id
            response = requests.post("http://digiliblt.lett.unipmn.it/autore.php", data=data)
            # TODO: parse the author page
            print(response.text)
            exit(0)

# This was the example of scraping the packhum site.
loc = "http://latin.packhum.org/canon"
html = requests.get(loc).text
soup = BeautifulSoup(html, "lxml")
anam_spans = soup.find_all("span", {'id': 'bolded'})
for span in anam_spans:
    print(span.span.text)
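
One way to fill in the "parse the author page" TODO is to feed the autore.php response into BeautifulSoup the same way the listing page is handled. The selectors below (the page title and the set of links on the page) are assumptions about the author page's markup rather than something confirmed against the live site, so treat this as a minimal sketch to adapt once the real HTML has been inspected.

from bs4 import BeautifulSoup

def parse_author_page(html):
    """Sketch: pull a few plausible fields out of an autore.php response.

    The tag/attribute choices here are guesses about the page structure;
    inspect the real markup and adjust the selectors accordingly.
    """
    soup = BeautifulSoup(html, "lxml")
    # The <title> element usually carries the author's name on pages like this.
    title = soup.title.text.strip() if soup.title else None
    # Collect every link on the page; filter these down once the real markup is known.
    links = [a.get("href") for a in soup.find_all("a", href=True)]
    return {"title": title, "links": links}

# Example usage inside the author loop, in place of the print/exit:
# info = parse_author_page(response.text)
# print(info["title"], len(info["links"]))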