Generate movie data as JSON from a Wikidata entity page (base layout); further improvements are welcome.
'''
Generate movie data as JSON from a Wikidata entity page
'''
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
import time
import re
import json


class MovieDataFetcher:
    def __init__(self, url):
        # url of the Wikidata entity page we want to scrape,
        # e.g. "https://www.wikidata.org/wiki/Q73028"
        self.url = url
        self.html = ''
        self.loadDriver()
        self.processPage()

    def loadDriver(self):
        print("Loading driver...")
        # initiating the webdriver; Service points at the chromedriver binary
        # (with Selenium 4.6+ the service argument can be omitted)
        driver = webdriver.Chrome(service=Service('./chromedriver'))
        driver.get(self.url)
        # this is just to ensure that the page is fully loaded
        time.sleep(5)
        self.html = driver.page_source
        driver.quit()
        print("Page fetched...")

    def processPage(self):
        print("Processing page...")
        # parse the page source rendered by the webdriver
        soup = BeautifulSoup(self.html, "html.parser")
        # taking the movie title from the page header
        title_res = soup.find(id="content")
        title = title_res.find_all("span", class_="wikibase-title-label")
        movie_name = title[0].text
        print("Movie :: " + movie_name)
        results = soup.find(id="mw-content-text")
        # wikidata statements are grouped as key/value pairs per property
        propGrps = results.find_all("div", class_="wikibase-statementgroupview")
        td_tag_list = []
        # iterating through each property group
        for prop in propGrps:
            temp = prop.find_all("a")
            temp_list = []
            # the first link is the property name, the remaining links are its values
            for anchor in temp:
                temp_val = anchor.text
                # skip links that are only Wikimedia/Wikipedia housekeeping
                if re.search(r'(.*?)Wikimedia(.*?)|(.+?)Wikipedia|inferred from|retrieved|(.*?)URL|Category:(.*?)', temp_val) is None:
                    temp_list.append(temp_val)
            # combining all property groups into a list
            td_tag_list.append(temp_list)
        dicts = {}
        for entry in td_tag_list:
            # a group can end up empty if every link in it was filtered out
            if not entry:
                continue
            key = entry.pop(0)
            dicts[key] = entry
        jsonStr = json.dumps(dicts, ensure_ascii=False)
        print(jsonStr)
        file_name = movie_name.replace(' ', '_') + '.json'
        with open(file_name, 'w', encoding='utf-8') as f:
            json.dump(dicts, f, ensure_ascii=False)
        print("Saved successfully in " + file_name)