Last active
March 21, 2018 06:19
-
-
Save justin3737/787e90f03f277b54be34cb49ef76f020 to your computer and use it in GitHub Desktop.
CMD帶入引數 利用BS4爬出表格內容 並輸出CSV
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import argparse
import codecs
import csv
import sys

from bs4 import BeautifulSoup
# Number of leading <td> cells exported from each table row.
COLUMN_COUNT = 6


def cell_text(cell):
    """Return the text of the first child node of a table cell.

    Args:
        cell: A parsed ``<td>`` element exposing a ``contents`` list.

    Returns:
        The first child as a plain ``str``. An empty cell yields ``""`` and
        a nested element (for example ``<a>`` or ``<span>``) yields its
        text, so the result can always be written to the CSV file.
    """
    if not cell.contents:
        return ""
    first_child = cell.contents[0]
    if isinstance(first_child, str):
        # Beautiful Soup text nodes subclass str; normalise to a plain str.
        return str(first_child)
    return first_child.get_text()


def extract_rows(table, column_count=COLUMN_COUNT):
    """Collect the first *column_count* cell texts of every row in *table*.

    Args:
        table: A parsed element whose ``find_all("tr")`` yields the rows.
        column_count: How many leading ``<td>`` cells to keep per row.

    Returns:
        A list of rows, each a list of ``column_count`` strings. Rows that
        have no ``<td>`` cells are skipped silently; rows that have some
        cells but fewer than ``column_count`` are skipped with a note on
        stderr instead of aborting the whole export.
    """
    rows = []
    for row in table.find_all("tr"):
        cells = row.find_all("td")
        if not cells:
            # Rows without <td> cells (e.g. header rows built from <th>)
            # carry nothing to export.
            continue
        if len(cells) < column_count:
            sys.stderr.write(
                "Skipping row with %d cell(s); expected at least %d\n"
                % (len(cells), column_count)
            )
            continue
        rows.append([cell_text(cell) for cell in cells[:column_count]])
    return rows


def write_csv(rows, path):
    """Write *rows* to *path* as a CSV file.

    Args:
        rows: An iterable of rows, each an iterable of strings.
        path: Destination file name; an existing file is overwritten.
    """
    # "utf-8-sig" writes a BOM so spreadsheet tools such as Excel detect the
    # encoding; newline="" lets the csv module control line endings itself.
    with open(path, "w", encoding="utf-8-sig", newline="") as csv_file:
        # csv.writer quotes values containing commas, quotes or newlines and
        # does not leave a trailing separator at the end of each line.
        csv.writer(csv_file).writerows(rows)


def main(argv=None):
    """Convert the ``<tbody>`` of ``<name>.htm`` into ``<name>.csv``.

    Usage: ``<file_name.py> -name "<html file name without extension>"``

    Args:
        argv: Optional argument list; defaults to the process arguments.

    Raises:
        SystemExit: If ``-name`` is missing or the page has no ``<tbody>``.
    """
    # Read the base file name from the command line.
    parser = argparse.ArgumentParser(
        description="Export the rows of an HTML table body to a CSV file."
    )
    parser.add_argument(
        "-name",
        required=True,
        help="base name of the input file; <name>.htm is read and "
             "<name>.csv is written",
    )
    args = parser.parse_args(argv)

    # Load the static HTML page and parse it with Beautiful Soup.
    source_path = args.name + ".htm"
    with codecs.open(source_path, "r", "utf-8") as html_file:
        soup = BeautifulSoup(html_file.read(), "html.parser")

    # Locate the table body that holds the data rows.
    table = soup.find("tbody")
    if table is None:
        raise SystemExit("No <tbody> element found in " + source_path)

    # The CSV file shares its base name with the source HTML file.
    write_csv(extract_rows(table), args.name + ".csv")


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment