Last active
September 14, 2021 19:55
-
-
Save KorigamiK/d363fbcfd0c25b6df9d267672996aa62 to your computer and use it in GitHub Desktop.
Downloads JEE Advanced paper solutions from VMC and also creates a crawljob file for JDownloader.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import asyncio
import json
import re

from aiohttp import ClientSession as c
from bs4 import BeautifulSoup as bs
# URL templates for the per-year pages; "{}" receives the two-digit year
# (e.g. 15 -> ...2015.html). main() uses `template` for years up to 17 and
# `template_2` for the later ones.
template = "https://www.vidyamandir.com/jeeadvanced20{}.html"
template_2 = "https://www.vidyamandir.com/jeeadv20{}.html"

# Matches the " for ...dvanced " filler in a row label when it is directly
# followed by a digit, so the filler can be collapsed to a single space.
unnecessary = re.compile(r"\sfor.+dvanced\s(?=\d)")
def _extract_links(html: str) -> dict:
    """Return a ``{label: href}`` mapping for every usable table row in *html*.

    A row is usable when it contains a ``<td>`` and an ``<a>`` whose ``href``
    is non-empty. The label is the text of the first ``<td>`` with the
    ``unnecessary`` filler collapsed, surrounding whitespace stripped and
    'Detailed Solutions and Questions' shortened to 'Solutions'; labels that
    contain 'utions' are prefixed with 'All '.
    """
    links = {}
    for tr in bs(html, "html.parser").find_all("tr"):
        cell = tr.find("td")
        anchor = tr.find("a")
        if cell is None or anchor is None:
            # Rows without a data cell or a link carry nothing to download.
            continue
        href = anchor.get("href")
        if not href:
            # An anchor without a target would end up as a null crawljob link.
            continue
        name = (
            unnecessary.sub(" ", cell.text)
            .strip()
            .replace('Detailed Solutions and Questions', 'Solutions')
        )
        links['All ' + name if 'utions' in name else name] = href
    return links


def _crawljob_entries(data: dict, download_folder: str) -> list:
    """Flatten ``{year: {label: href}}`` into a list of crawljob entries."""
    return [
        {
            'text': link,
            'filename': f'{name}.pdf',
            # Left disabled, as in the original script:
            # "autoConfirm": "TRUE",
            # "autoStart": "TRUE",
            "downloadFolder": download_folder,
        }
        for links in data.values()
        for name, link in links.items()
    ]


async def main(download_folder: str = "D:\\Documents\\Solutions\\pyq\\adv") -> None:
    """Scrape the 2015-2020 pages and write ``data.json`` and ``data.crawljob``.

    All pages are fetched concurrently through one aiohttp session. Years whose
    page yields no usable rows are left out. ``data.json`` receives the
    ``{year: {label: href}}`` mapping; ``data.crawljob`` receives one entry per
    link.

    Args:
        download_folder: Value written to every entry's ``downloadFolder``
            field. Defaults to the folder the script originally hard-coded.
    """
    data = {}
    async with c() as session:

        async def get_link(url: str, year: int) -> None:
            """Fetch *url* and record its links under *year* if any were found."""
            async with session.get(url) as resp:
                links = _extract_links(await resp.text())
            if links:
                data[year] = links

        # Years up to 2017 use the long page name, later years the short one.
        await asyncio.gather(
            *(
                get_link((template if i <= 17 else template_2).format(i), i)
                for i in range(15, 21)
            )
        )

    # Responses complete in arbitrary order; sort so the output files are stable.
    data = dict(sorted(data.items()))

    with open("data.json", "w", encoding="utf-8") as file:
        json.dump(data, file, indent=4)

    with open('data.crawljob', 'w', encoding="utf-8") as file:
        json.dump(_crawljob_entries(data, download_folder), file)
if __name__ == "__main__":
    # asyncio.run() creates a fresh event loop, runs main() to completion and
    # closes the loop, instead of relying on an implicitly created current loop.
    asyncio.run(main())
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment