Created
July 11, 2017 08:19
-
-
Save henryyang42/8a57a5ab30c4be0778d4f5b258ed413b to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": 6, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"import os\n", | |
"import time\n", | |
"import requests\n", | |
"import urllib\n", | |
"from bs4 import BeautifulSoup\n", | |
"import numpy" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 40, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"a = []\n", | |
"for genre in [33, 34, 35]:\n", | |
" for i in range(1, 12):\n", | |
" url = 'http://www.juben108.com/telescript_%d_%d_0/' % (genre, i)\n", | |
" res = requests.get(url, stream=True)\n", | |
" soup = BeautifulSoup(res.text, 'html.parser')\n", | |
" a.extend(soup.find_all(class_='hui'))" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 57, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"unique_a = numpy.unique(list(map(str, a)))\n", | |
"target_a = []\n", | |
"for kwd in ['鲜族兄弟', '古墓的杀机', '瞪眼等着好事来', '三月桃花雨', '阳光岁月']:\n", | |
" target_a.extend([a for a in unique_a if kwd in a])\n", | |
"s = '\\n'.join(target_a)\n", | |
"with open('urls.txt', 'w') as f:\n", | |
" f.write(s)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 5, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"128 scripts\n" | |
] | |
} | |
], | |
"source": [ | |
"with open('urls.txt') as f:\n", | |
" s = f.read()\n", | |
"soup = BeautifulSoup(s, 'html.parser')\n", | |
"\n", | |
"base_url = 'http://www.juben108.com'\n", | |
"urls = [(base_url+a['href'], a.text) for a in soup.find_all('a')]\n", | |
"print ('%d scripts' % len(urls))" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"http://www.juben108.com/telescript_77940_1/ 电视连续剧文学剧本《鲜族兄弟》\n", | |
"http://www.juben108.com/telescript_77957_1/ 电视连续剧文学剧本《鲜族兄弟》第二集\n", | |
"http://www.juben108.com/telescript_77987_1/ 电视连续剧文学剧本《鲜族兄弟》第三集\t\t\n", | |
"http://www.juben108.com/telescript_78002_1/ 电视连续剧文学剧本《鲜族兄弟》第四集\n", | |
"http://www.juben108.com/telescript_78004_1/ 电视连续剧文学剧本《鲜族兄弟》第五集\t\n", | |
"http://www.juben108.com/telescript_78019_1/ 电视连续剧文学剧本《鲜族兄弟》第六集\n", | |
"http://www.juben108.com/telescript_78039_1/ 电视连续剧文学剧本《鲜族兄弟》第七集\t\n", | |
"http://www.juben108.com/telescript_78043_1/ 电视连续剧文学剧本《鲜族兄弟》第八集\n", | |
"http://www.juben108.com/telescript_78063_1/ 电视连续剧文学剧本《鲜族兄弟》第九集\n", | |
"http://www.juben108.com/telescript_78092_1/ 电视连续剧文学剧本《鲜族兄弟》第十集\n", | |
"http://www.juben108.com/telescript_78128_1/ 电视连续剧文学剧本《鲜族兄弟》第十一集\t\t\n", | |
"http://www.juben108.com/telescript_78153_1/ 电视连续剧文学剧本《鲜族兄弟》第十二集\n", | |
"http://www.juben108.com/telescript_78198_1/ 电视连续剧文学剧本《鲜族兄弟》第十三集\n", | |
"http://www.juben108.com/telescript_78224_1/ 电视连续剧文学剧本《鲜族兄弟》第十四集\n", | |
"http://www.juben108.com/telescript_78236_1/ 电视连续剧文学剧本《鲜族兄弟》第十五集\n", | |
"http://www.juben108.com/telescript_78298_1/ 电视连续剧文学剧本《鲜族兄弟》第十六集\n", | |
"http://www.juben108.com/telescript_78336_1/ 电视连续剧文学剧本《鲜族兄弟》第十六集\n", | |
"http://www.juben108.com/telescript_78345_1/ 电视连续剧文学剧本《鲜族兄弟》第十八集\n", | |
"http://www.juben108.com/telescript_78353_1/ 电视连续剧文学剧本《鲜族兄弟》第十九集\n", | |
"http://www.juben108.com/telescript_78356_1/ 电视连续剧文学剧本《鲜族兄弟》第二十集\n", | |
"http://www.juben108.com/telescript_78375_1/ 电视连续剧文学剧本《鲜族兄弟》第二十一集\n", | |
"http://www.juben108.com/telescript_78385_1/ 电视连续剧文学剧本《鲜族兄弟》第二十二集\t\t\n", | |
"http://www.juben108.com/telescript_78399_1/ 电视连续剧文学剧本《鲜族兄弟》第二十三集\n", | |
"http://www.juben108.com/telescript_78459_1/ 电视连续剧文学剧本《鲜族兄弟》第二十四集\n", | |
"http://www.juben108.com/telescript_78470_1/ 电视连续剧文学剧本《鲜族兄弟》第二十五集\n", | |
"http://www.juben108.com/telescript_78497_1/ 电视连续剧文学剧本《鲜族兄弟》第二十六集\n", | |
"http://www.juben108.com/telescript_78533_1/ 电视连续剧文学剧本《鲜族兄弟》第二十七集\n", | |
"http://www.juben108.com/telescript_78554_1/ 电视连续剧文学剧本《鲜族兄弟》第二十八集\n", | |
"http://www.juben108.com/telescript_78585_1/ 电视连续剧文学剧本《鲜族兄弟》第二十九集\n", | |
"http://www.juben108.com/telescript_78618_1/ 电视连续剧文学剧本《鲜族兄弟》第三十集\n", | |
"http://www.juben108.com/telescript_78639_1/ 电视连续剧文学剧本《鲜族兄弟》第三十一集\n" | |
] | |
} | |
], | |
"source": [ | |
"for url, title in urls:\n", | |
" print (url, title)\n", | |
" res = requests.get(url, stream=True)\n", | |
" soup = BeautifulSoup(res.text, 'html.parser')\n", | |
" for i in range(1, 5):\n", | |
" s = soup.find(id='contain_%d' % i)\n", | |
" with open('%s_%d.txt' % (title, i), 'w') as f:\n", | |
" f.write(str(s))\n", | |
" " | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"collapsed": true | |
}, | |
"source": [ | |
"# 集數夠多且格式較好的劇本\n", | |
"\n", | |
"## 农村电视剧本\n", | |
"- 电视连续剧文学剧本《鲜族兄弟》第三十一集\n", | |
"- 古墓的杀机:第三十集\n", | |
"\n", | |
"## 都市电视剧本\n", | |
"- 瞪眼等着好事来(第二十集)\n", | |
"\n", | |
"## 偶像电视剧本\n", | |
"- 三月桃花雨:第二十一集(全剧终)\n", | |
"- 《阳光岁月》第二十五集\n", | |
"\n", | |
"## 剧本包含\n", | |
"- Location\n", | |
"- Time\n", | |
"- Scene description \n", | |
"- Speaker\n", | |
"\n", | |
"## Crawled data\n", | |
"- 128 scripts\n", | |
"- ~6MB data\n", | |
"\n", | |
"## example\n", | |
"```\n", | |
"<div id=\"contain_4\" style=\"font-size:14px; line-height:24px;\">\n", | |
"<p align=\"right\"> </p>\n", | |
"<p>第 五 集</p>\n", | |
"<p>1、 景阳镇“贵族”歌舞厅 傍晚</p>\n", | |
"<p><strong>[</strong>人物:韩东、晓华、马军、白鸽<strong>]</strong></p>\n", | |
"<p>[景阳街上的“贵族歌舞厅”,招牌上霓虹灯闪烁,门口的音箱播放着叮叮嘣嘣近似疯狂的音乐]</p>\n", | |
"<p style=\"MARGIN-LEFT: 29.1pt\"> </p>\n", | |
"<p>[韩东和晓华来到歌厅门口]</p>\n", | |
"<p>韩 东:“你等一下,我去买票。”</p>\n", | |
"<p>晓 华:“他们俩来了吗?”</p>\n", | |
"<p>韩 东:“我跟他们说好了,他们会来的,咱们先进去。”</p>\n", | |
"<p>[韩东去买了门票,和晓华一同走进歌厅]</p>\n", | |
"<p>[马军和白鸽已经先来了。在一个可坐四人的茶座里,马军和白鸽偎在一起坐着]</p>\n", | |
"<p> </p>\n", | |
"<p>[看见韩东和晓华进来,马军站起来招呼他们]</p>\n", | |
"<p>马 军:“韩东,晓华姐!”</p>\n", | |
"<p>[白鸽只活动了一下把身子坐正。在白炽灯下,白鸽的脸显得很白,涂着口红的唇、纹黑了的眉和淡淡的眼影使她的脸看去异常妩媚,刚烫洗过的秀发披散着]</p>\n", | |
"<p>[韩东和晓华在茶座坐下]</p>\n", | |
"<p style=\"MARGIN-LEFT: 28.9pt\"> </p>\n", | |
"<p>白 鸽:“晓华姐!你上哪儿去了,好几天没见?”</p>\n", | |
"<p>晓 华:“上哈尔滨去了。”</p>\n", | |
"<p>[马军给韩东和晓华面前各放一罐饮料]</p>\n", | |
"<p> </p>\n", | |
"```\n", | |
"# !\n", | |
"```\n", | |
"<p>马 军:“上哈尔滨干啥?”</p>\n", | |
"<p>晓 华:“散散心。那小子还上店里来吗?”</p>\n", | |
"<p>马 军:“谁呀?”</p>\n", | |
"<p>晓 华:“那个罗什么,罗川?”</p>\n", | |
"```\n", | |
"# !\n", | |
"```\n", | |
"<p>白 鸽:“你走了以后他又来过两天,后来知道你真不回来了,他没指望了,再也没来。”</p>\n", | |
"<p>马 军:“你走了好几天,咋也不跟小姨说一声,她肯定生气了吧?”</p>\n", | |
"<p>晓 华:“生气呗,我还生气呢,谁叫她整这事儿呢。”</p>\n", | |
"<p>白 鸽:“小姨还不是为你着想吗?”</p>\n", | |
"<p>晓 华:“得了吧。”</p>\n", | |
"<p>韩 东:“你们的公司开张了吗?”</p>\n", | |
"<p>白 鸽:“开啥张呀?他就礼拜天才来一下,平时也来不了,就我自个儿。”</p>\n", | |
"<p>马 军:“执照已经领了。”</p>\n", | |
"<p>白 鸽:“要我说,马军就别在那干了,带死不活的,混个啥劲呀!”</p>\n", | |
"<p>韩 东:“别,再怎么说那也是铁饭碗,旱涝保收。”</p>\n", | |
"<p>马 军:“我也是这么想。我给领导提建议了,印刷厂再买两台印刷机,彩印的,国庆节出特刊就用得 上,以后还能出广告赚钱。如果过了节真不出报了,印刷厂也就撤了,我再下来也不晚。以厂 收和晓华买了门票进去的时候,”</p>\n", | |
"<p>晓 华:“对,这样好。”</p>\n", | |
"<p>韩 东:“看来你们是真要单干了,可是生意咋样,好作吗?</p>\n", | |
"<p>白 鸽:“不好作,得顾三头,一头得买出来,一头得卖出去,中间还有个运输问题。运输也麻烦,先 \n", | |
"```" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.5.2" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 1 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment