Created
October 7, 2016 23:11
-
-
Save ihfazhillah/a2e5cdeb7ea106b37bee49b2c0da671c to your computer and use it in GitHub Desktop.
Scraper sederhana untuk mendapatkan daftar surat dan jumlah ayat. Hasilnya disimpan sebagai file JSON.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python3 | |
"""Scrape alquran surahs into json format""" | |
import requests | |
from bs4 import BeautifulSoup | |
import json | |
URL = "http://www.islamology.com/quran/fahres.asp" | |
def get_response(url=URL, timeout=30):
    """Fetch *url* and return the HTTP response object.

    Args:
        url: Address to request; defaults to the module-level ``URL``.
        timeout: Seconds to wait for the server before giving up.
            ``requests`` waits indefinitely when no timeout is passed,
            so without it a stalled server would hang the script.

    Returns:
        The response object returned by ``requests.get``.

    Raises:
        requests.exceptions.Timeout: If the server does not answer
            within ``timeout`` seconds.
    """
    return requests.get(url, timeout=timeout)
def parse_response(response):
    """Parse the surah index page into a list of dicts.

    Args:
        response: Object whose ``content`` attribute holds the page
            HTML (e.g. the value returned by ``get_response``).

    Returns:
        A list with one dict per table row, each with the keys
        ``surah_no``, ``surah_name``, ``surah_juz``, ``surah_page``,
        ``surah_ayat`` and ``surah_type``. The number, page and ayat
        count are converted to ``int``; the other values are stripped
        strings.

    Raises:
        ValueError: If a number, page or ayat cell is not an integer.
    """
    soup = BeautifulSoup(response.content, 'html.parser')
    result = []
    for row in soup.select("tbody.Common_TBODY > tr"):
        cells = row.select("td")
        if len(cells) < 6:
            # Rows without the six data cells (e.g. header or spacer
            # rows) carry no surah record; skip them rather than fail
            # with IndexError.
            continue
        result.append({
            "surah_no": int(cells[0].text),
            "surah_name": cells[1].text.strip(),
            "surah_juz": cells[2].text.strip(),
            "surah_page": int(cells[3].text),
            "surah_ayat": int(cells[4].text),
            "surah_type": cells[5].text.strip(),
        })
    return result
def dump_json(data, fn, indent=4):
    """Write *data* to the file *fn* as JSON.

    Args:
        data: JSON-serialisable object to write.
        fn: Path of the output file; it is created or overwritten.
        indent: Number of spaces used to pretty-print nested levels.
    """
    # ensure_ascii=False writes non-ASCII characters verbatim, so the file
    # is opened as UTF-8 explicitly: the platform default encoding may not
    # be able to represent them.
    with open(fn, "w", encoding="utf-8") as f:
        json.dump(data, f, indent=indent, ensure_ascii=False)
if __name__ == '__main__':
    # Fetch the index page, extract the surah rows and save them as JSON.
    surahs = parse_response(get_response())
    dump_json(surahs, "daftar_surat.json")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Hasil