download_all_scripts.py: scrape movie scripts from IMSDB and save each one as a plain-text file.
import os
import random
from datetime import datetime
from urllib.parse import quote

import requests
from bs4 import BeautifulSoup

BASE_URL = 'http://www.imsdb.com'
SCRIPTS_DIR = 'scripts'


def clean_script(text):
    """Remove IMSDB navigation text and stray markup from a script's text."""
    text = text.replace('Back to IMSDb', '')
    text = text.replace('''<b><!--
</b>if (window!= top)
top.location.href=location.href
<b>// -->
</b>
''', '')
    text = text.replace(''' Scanned by http://freemoviescripts.com
Formatting by http://simplyscripts.home.att.net
''', '')
    return text.replace(r'\r', '')


def get_script(relative_link):
    """Fetch a script's front page, follow the HTML link, and return
    (title, cleaned text). Returns (None, None) for missing or PDF scripts."""
    tail = relative_link.split('/')[-1]
    script_front_url = BASE_URL + quote(relative_link)
    front_page_response = requests.get(script_front_url)
    front_soup = BeautifulSoup(front_page_response.text, "html.parser")

    try:
        script_link = front_soup.find_all('p', align="center")[0].a['href']
    except IndexError:
        print('%s has no script :(' % tail)
        return None, None

    if script_link.endswith('.html'):
        title = script_link.split('/')[-1].split(' Script')[0]
        script_url = BASE_URL + script_link
        script_soup = BeautifulSoup(
            requests.get(script_url).text, "html.parser")
        script_text = script_soup.find_all(
            'td', {'class': "scrtext"})[0].get_text()
        script_text = clean_script(script_text)
        return title, script_text
    else:
        print('%s is a pdf :(' % tail)
        return None, None


if __name__ == "__main__":
    os.makedirs(SCRIPTS_DIR, exist_ok=True)

    # The all-scripts index lists every title inside its own <p> tag.
    response = requests.get('http://www.imsdb.com/all-scripts.html')
    html = response.text
    soup = BeautifulSoup(html, "html.parser")
    paragraphs = soup.find_all('p')
    random.shuffle(paragraphs)

    n = 0
    for p in paragraphs:
        n += 1
        if p.a is None:
            # Skip <p> tags that are not script links.
            continue
        relative_link = p.a['href']
        title, script = get_script(relative_link)
        print("{}\t{:.2%}\t{} of {}\t{}".format(
            datetime.now(), n / len(paragraphs), n, len(paragraphs), title))
        if not script:
            continue
        # str.strip('.html') removes a set of characters, not a suffix,
        # so trim the extension explicitly before writing the .txt file.
        filename = title[:-5] if title.endswith('.html') else title
        with open(os.path.join(SCRIPTS_DIR, filename + '.txt'),
                  'w', encoding='utf-8') as outfile:
            outfile.write(script)
Pinned dependencies:

beautifulsoup4==4.9.3
requests==2.25.1
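
For a quick sanity check without crawling the whole index, get_script can also be imported and run against a single title. The sketch below assumes the module is saved as download_all_scripts.py (the gist's filename); the relative link is only an illustrative example of IMSDB's "/Movie Scripts/<Title> Script.html" URL pattern, not a verified path.

# Hypothetical single-title fetch; the relative link is illustrative only.
from download_all_scripts import get_script

title, script_text = get_script('/Movie Scripts/Alien Script.html')
if script_text:
    print('Fetched %s (%d characters)' % (title, len(script_text)))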