Skip to content

Instantly share code, notes, and snippets.

@snoop2head
Last active January 2, 2020 16:16
Show Gist options
  • Save snoop2head/6a3317e171432aa04aea2eaa554dda79 to your computer and use it in GitHub Desktop.
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
import csv
from urllib.parse import urlparse
def crwl_as_csv(univ_query,
                out_dir=r'C:/Users/pc/Documents/GitHub/OIA_Text_Wrangling/dataf/'):
    """Crawl every page of a Yonsei OIA exchange-report listing and save it as CSV.

    Walks the paginated table at oia.yonsei.ac.kr for the partner university
    identified by *univ_query*, accumulating each page's table (plus the href
    of every anchor inside it) into one DataFrame, then writes the result to
    ``<out_dir><univ_query>.csv``.

    Parameters
    ----------
    univ_query : str or int
        University code (``ucode``) used in the report URL and the CSV filename.
    out_dir : str, optional
        Directory the CSV is written to. Defaults to the original hard-coded path.

    Raises
    ------
    requests.HTTPError
        If any page request returns a non-2xx status.
    """
    page = 1
    df = pd.DataFrame()
    while page:
        url = ("https://oia.yonsei.ac.kr/partner/expReport.asp?page=" + str(page)
               + "&cur_pack=0&ucode=" + str(univ_query) + "&bgbn=A")
        # Timeout so a dead server can't hang the crawl forever; fail loudly
        # on HTTP errors instead of parsing an error page as data.
        res = requests.get(url, timeout=10)
        res.raise_for_status()
        soup = BeautifulSoup(res.content, 'lxml')
        table = soup.find_all('table')[0]
        df_crawl = pd.read_html(str(table), encoding='utf-8', header=0)[0]
        # Tag.get already supports a default, so no numpy dance is needed;
        # the original np.where produced 0-d arrays instead of plain strings.
        df_crawl['href'] = [tag.get('href', 'no link') for tag in table.find_all('a')]
        if df_crawl.empty:
            # An empty table marks the page past the last one: stop crawling.
            print(df)
            break
        page += 1
        df = pd.concat([df, df_crawl], sort=False)
    df_without_index = df.reset_index()
    print(df_without_index)
    # str() here too — the original only converted univ_query for the URL,
    # so an int query crashed at save time with a TypeError.
    df_without_index.to_csv(out_dir + str(univ_query) + '.csv',
                            index=False, encoding="utf-8")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment