jecolasurdo · May 2, 2021 07:05
diff --git a/download_all_scripts.py b/download_all_scripts.py
 import os
 import requests
 import random

 from datetime import datetime

 from urllib.parse import quote
 from bs4 import BeautifulSoup

 BASE_URL = 'http://www.imsdb.com'
 SCRIPTS_DIR = 'scripts'


 def clean_script(text):
    """Strip known IMSDb page boilerplate from extracted script text.

    Removes, in order: the 'Back to IMSDb' link text, the frame-busting
    JavaScript fragment, the scanner/formatter credit lines, and carriage
    return characters.

    Args:
        text: Text extracted from a script page.

    Returns:
        The text with every occurrence of those fragments removed.
    """
    text = text.replace('Back to IMSDb', '')
    text = text.replace('''<b><!--
 </b>if (window!= top)
 top.location.href=location.href
 <b>// -->
 </b>
 ''', '')
    text = text.replace('''          Scanned by http://freemoviescripts.com
          Formatting by http://simplyscripts.home.att.net
 ''', '')
    # A raw string here would match a backslash followed by the letter "r";
    # the escape sequence is needed to remove real carriage returns.
    return text.replace('\r', '')


 def get_script(relative_link, timeout=30):
    """Download one script from IMSDb and return its title and cleaned text.

    The movie's front page is fetched first; the first centred paragraph
    on it is expected to link to the page holding the script itself.

    Args:
        relative_link: Site-relative link to the movie's front page.
        timeout: Seconds to wait for each HTTP request before giving up.

    Returns:
        A ``(title, script_text)`` tuple. ``title`` is the last path segment
        of the script link, cut at the first ' Script'. ``(None, None)`` is
        returned (after printing the reason) when a page cannot be
        downloaded, the front page has no script link, the link is not an
        '.html' page, or the script page has no script text cell.
    """
    tail = relative_link.split('/')[-1]
    script_front_url = BASE_URL + quote(relative_link)

    # A single failed or stalled download must not abort the whole crawl.
    try:
        front_page_response = requests.get(script_front_url, timeout=timeout)
    except requests.RequestException as err:
        print('%s could not be downloaded: %s' % (tail, err))
        return None, None
    front_soup = BeautifulSoup(front_page_response.text, "html.parser")

    # The paragraph, its <a> tag, or the href attribute may each be missing.
    centered = front_soup.find_all('p', align="center")
    anchor = centered[0].a if centered else None
    if anchor is None or not anchor.get('href'):
        print('%s has no script :(' % tail)
        return None, None
    script_link = anchor['href']

    if not script_link.endswith('.html'):
        print('%s is a pdf :(' % tail)
        return None, None

    title = script_link.split('/')[-1].split(' Script')[0]
    script_url = BASE_URL + script_link
    try:
        script_page = requests.get(script_url, timeout=timeout)
    except requests.RequestException as err:
        print('%s could not be downloaded: %s' % (tail, err))
        return None, None
    script_soup = BeautifulSoup(script_page.text, "html.parser")

    script_cells = script_soup.find_all('td', {'class': "scrtext"})
    if not script_cells:
        print('%s has no script text :(' % tail)
        return None, None

    return title, clean_script(script_cells[0].get_text())


 if __name__ == "__main__":
    # Fetch the index of all scripts, then download each linked script in
    # random order and save it as a UTF-8 text file under SCRIPTS_DIR.
    response = requests.get('http://www.imsdb.com/all-scripts.html',
                            timeout=30)
    soup = BeautifulSoup(response.text, "html.parser")

    # Keep only paragraphs that carry a link; subscripting a missing <a>
    # tag would otherwise raise and stop the run.
    paragraphs = [p for p in soup.find_all('p')
                  if p.a is not None and p.a.get('href')]
    random.shuffle(paragraphs)

    # open() does not create missing directories.
    os.makedirs(SCRIPTS_DIR, exist_ok=True)

    total = len(paragraphs)
    for n, p in enumerate(paragraphs, start=1):
        title, script = get_script(p.a['href'])
        print("{}\t{:.2%}\t{} of {}\t{}".format(datetime.now(),
                                                n / total, n, total, title))
        if not script:
            continue

        # str.strip('.html') removes any of the characters ".", "h", "t",
        # "m", "l" from both ends (e.g. "Hamlet.html" -> "Hamle"), so drop
        # the exact suffix instead.
        name = title[:-len('.html')] if title.endswith('.html') else title
        out_path = os.path.join(SCRIPTS_DIR, name + '.txt')
        with open(out_path, 'w', encoding='utf-8') as outfile:
            outfile.write(script)
diff --git a/requirements.txt b/requirements.txt
 beautifulsoup4==4.9.3
 requests==2.25.1
	import os
	import requests
	import random

	from datetime import datetime

	from urllib.parse import quote
	from bs4 import BeautifulSoup

	BASE_URL = 'http://www.imsdb.com'
	SCRIPTS_DIR = 'scripts'


	def clean_script(text):
	    """Strip known IMSDb page boilerplate from extracted script text.

	    Removes, in order: the 'Back to IMSDb' link text, the frame-busting
	    JavaScript fragment, the scanner/formatter credit lines, and carriage
	    return characters.

	    Args:
	        text: Text extracted from a script page.

	    Returns:
	        The text with every occurrence of those fragments removed.
	    """
	    text = text.replace('Back to IMSDb', '')
	    text = text.replace('''<b><!--
	</b>if (window!= top)
	top.location.href=location.href
	<b>// -->
	</b>
	''', '')
	    text = text.replace(''' Scanned by http://freemoviescripts.com
	Formatting by http://simplyscripts.home.att.net
	''', '')
	    # A raw string here would match a backslash followed by the letter "r";
	    # the escape sequence is needed to remove real carriage returns.
	    return text.replace('\r', '')


	def get_script(relative_link, timeout=30):
	    """Download one script from IMSDb and return its title and cleaned text.

	    The movie's front page is fetched first; the first centred paragraph
	    on it is expected to link to the page holding the script itself.

	    Args:
	        relative_link: Site-relative link to the movie's front page.
	        timeout: Seconds to wait for each HTTP request before giving up.

	    Returns:
	        A ``(title, script_text)`` tuple. ``title`` is the last path segment
	        of the script link, cut at the first ' Script'. ``(None, None)`` is
	        returned (after printing the reason) when a page cannot be
	        downloaded, the front page has no script link, the link is not an
	        '.html' page, or the script page has no script text cell.
	    """
	    tail = relative_link.split('/')[-1]
	    script_front_url = BASE_URL + quote(relative_link)

	    # A single failed or stalled download must not abort the whole crawl.
	    try:
	        front_page_response = requests.get(script_front_url, timeout=timeout)
	    except requests.RequestException as err:
	        print('%s could not be downloaded: %s' % (tail, err))
	        return None, None
	    front_soup = BeautifulSoup(front_page_response.text, "html.parser")

	    # The paragraph, its <a> tag, or the href attribute may each be missing.
	    centered = front_soup.find_all('p', align="center")
	    anchor = centered[0].a if centered else None
	    if anchor is None or not anchor.get('href'):
	        print('%s has no script :(' % tail)
	        return None, None
	    script_link = anchor['href']

	    if not script_link.endswith('.html'):
	        print('%s is a pdf :(' % tail)
	        return None, None

	    title = script_link.split('/')[-1].split(' Script')[0]
	    script_url = BASE_URL + script_link
	    try:
	        script_page = requests.get(script_url, timeout=timeout)
	    except requests.RequestException as err:
	        print('%s could not be downloaded: %s' % (tail, err))
	        return None, None
	    script_soup = BeautifulSoup(script_page.text, "html.parser")

	    script_cells = script_soup.find_all('td', {'class': "scrtext"})
	    if not script_cells:
	        print('%s has no script text :(' % tail)
	        return None, None

	    return title, clean_script(script_cells[0].get_text())


	if __name__ == "__main__":
	    # Fetch the index of all scripts, then download each linked script in
	    # random order and save it as a UTF-8 text file under SCRIPTS_DIR.
	    response = requests.get('http://www.imsdb.com/all-scripts.html',
	                            timeout=30)
	    soup = BeautifulSoup(response.text, "html.parser")

	    # Keep only paragraphs that carry a link; subscripting a missing <a>
	    # tag would otherwise raise and stop the run.
	    paragraphs = [p for p in soup.find_all('p')
	                  if p.a is not None and p.a.get('href')]
	    random.shuffle(paragraphs)

	    # open() does not create missing directories.
	    os.makedirs(SCRIPTS_DIR, exist_ok=True)

	    total = len(paragraphs)
	    for n, p in enumerate(paragraphs, start=1):
	        title, script = get_script(p.a['href'])
	        print("{}\t{:.2%}\t{} of {}\t{}".format(datetime.now(),
	                                                n / total, n, total, title))
	        if not script:
	            continue

	        # str.strip('.html') removes any of the characters ".", "h", "t",
	        # "m", "l" from both ends (e.g. "Hamlet.html" -> "Hamle"), so drop
	        # the exact suffix instead.
	        name = title[:-len('.html')] if title.endswith('.html') else title
	        out_path = os.path.join(SCRIPTS_DIR, name + '.txt')
	        with open(out_path, 'w', encoding='utf-8') as outfile:
	            outfile.write(script)