Created
November 21, 2019 02:07
-
-
Save drbh/688977367f6337c69113237afcaf88e2 to your computer and use it in GitHub Desktop.
Scrape, parse, and save the City of Phoenix's FAQ questions to a CSV
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import io

from bs4 import BeautifulSoup
import pandas as pd
import requests
def remove_non_ascii(text):
    """Return *text* with every non-ASCII character removed.

    Characters whose code point is 128 or above are dropped; all other
    characters are kept in their original order.

    Args:
        text: The string to filter.

    Returns:
        A new string containing only the ASCII characters of *text*.
    """
    # Encoding with errors="ignore" silently discards anything outside the
    # 7-bit ASCII range, which is exactly the `ord(ch) < 128` filter.
    return text.encode("ascii", "ignore").decode("ascii")
def extract_question_text(s):
    """Extract the ASCII text of the first ``<div>`` found in an HTML snippet.

    Args:
        s: HTML markup to parse. Anything that is not ``str``/``bytes``
            (for example a NaN float standing in for an empty table cell)
            is treated as having no content.

    Returns:
        The text content of the first ``<div>`` with non-ASCII characters
        removed, or ``"-"`` when there is no usable markup or no ``<div>``.
    """
    # This function is applied to a DataFrame column; missing cells there are
    # typically NaN floats, which BeautifulSoup cannot parse (it would raise
    # TypeError). Map them to the same placeholder used for "no <div>".
    if not isinstance(s, (str, bytes)):
        return "-"

    html_tree = BeautifulSoup(s, "lxml").find("div")
    if html_tree is None:
        return "-"
    return remove_non_ascii(html_tree.get_text())
# Download the City of Phoenix FAQ page, pull its first HTML table into a
# DataFrame, keep the question/answer columns, and save them as a CSV.
url = "https://www.phoenix.gov/faq"

# A timeout keeps the script from hanging forever on a stalled connection;
# raise_for_status() stops us from parsing an HTTP error page as if it were
# the FAQ.
r = requests.get(url=url, timeout=30)
r.raise_for_status()
html = r.content

soup = BeautifulSoup(html, "lxml")
table = soup.find("table")
if table is None:
    # Without this check str(None) == "None" would be handed to read_html,
    # producing a confusing "no tables found" error.
    raise ValueError(f"no <table> element found at {url}")

# Recent pandas versions deprecate passing literal HTML as a plain string,
# so wrap the markup in a file-like object.
df = pd.read_html(io.StringIO(str(table)))
frame = df[0]

# Columns 2 and 3 are selected by position and renamed to question/answer.
# .copy() makes the selection an independent frame so the assignments below
# modify it directly instead of a view of the parsed table.
frame = frame[[2, 3]].copy()
frame.columns = ["question", "answer"]

temp = frame["answer"].apply(extract_question_text)
frame.loc[:, "answer"] = temp

frame.to_csv("~/Desktop/phoenix_gov_faq.csv")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment