audhiaprilliant · April 20, 2022 01:28
diff --git a/zipfs_law_autostopwords_2.py b/zipfs_law_autostopwords_2.py
 # Plain-text files hosted on gutenberg.org that are downloaded and merged
 urls = [
     'https://www.gutenberg.org/files/1661/1661-0.txt',
     'https://www.gutenberg.org/files/2701/2701-0.txt',
     'https://www.gutenberg.org/files/11/11-0.txt',
     'https://www.gutenberg.org/files/98/98-0.txt',
     'https://www.gutenberg.org/files/74/74-0.txt'
 ]

 # Marker texts for the start and end of the main content of each file
 START = 'START OF THE PROJECT GUTENBERG EBOOK'
 END = 'END OF THE PROJECT GUTENBERG EBOOK'

 # Patterns are loop-invariant, so they are compiled once up front.
 # Raw strings keep '\s', '\S' and '\n' as regex escapes instead of
 # (invalid) string escapes. '[\s\S]+' already matches any run of
 # characters, newlines included, so no further alternatives are needed.
 # Captures everything that follows the line holding the START marker.
 after_start = re.compile(START + r'.+\n([\s\S]+)')
 # Captures everything that precedes the last END marker.
 before_end = re.compile(r'([\s\S]+)' + END)

 # Cleaned main content of every file, in download order
 parts = []

 # Loop URLs
 for url in urls:
     # Make a GET request for the story; the timeout keeps a stalled
     # connection from hanging the script indefinitely
     f = requests.get(url, timeout=30)
     # Fail on an HTTP error status instead of parsing an error page
     f.raise_for_status()

     # Character set encoding: UTF-8 (undecodable bytes are replaced
     # rather than raising)
     text_partial = f.content.decode(encoding='utf-8', errors='replace')

     # Remove the start mark and everything before it
     match = after_start.search(text_partial)
     if match is None:
         raise ValueError(
             'Start marker not found in {url}'.format(url=url)
         )
     text_partial = match.group(1)

     # Remove the end mark and everything after it
     match = before_end.search(text_partial)
     if match is None:
         raise ValueError(
             'End marker not found in {url}'.format(url=url)
         )
     text_partial = match.group(1)

     # Collect the cleaned text
     parts.append(text_partial)

     # Status
     print('Access and preprocess >>> {url}'.format(url=url))

 # Full corpus: the cleaned texts concatenated without a separator
 text = ''.join(parts)
	# Plain-text files hosted on gutenberg.org that are downloaded and merged
	urls = [
		'https://www.gutenberg.org/files/1661/1661-0.txt',
		'https://www.gutenberg.org/files/2701/2701-0.txt',
		'https://www.gutenberg.org/files/11/11-0.txt',
		'https://www.gutenberg.org/files/98/98-0.txt',
		'https://www.gutenberg.org/files/74/74-0.txt'
	]

	# Marker texts for the start and end of the main content of each file
	START = 'START OF THE PROJECT GUTENBERG EBOOK'
	END = 'END OF THE PROJECT GUTENBERG EBOOK'

	# Patterns are loop-invariant, so they are compiled once up front.
	# Raw strings keep '\s', '\S' and '\n' as regex escapes instead of
	# (invalid) string escapes. The capture group is a plain '[\s\S]+'
	# (any run of characters, newlines included); it must not contain an
	# escaped '\|', which would require literal pipe characters in the text.
	# Captures everything that follows the line holding the START marker.
	after_start = re.compile(START + r'.+\n([\s\S]+)')
	# Captures everything that precedes the last END marker.
	before_end = re.compile(r'([\s\S]+)' + END)

	# Cleaned main content of every file, in download order
	parts = []

	# Loop URLs
	for url in urls:
		# Make a GET request for the story; the timeout keeps a stalled
		# connection from hanging the script indefinitely
		f = requests.get(url, timeout=30)
		# Fail on an HTTP error status instead of parsing an error page
		f.raise_for_status()

		# Character set encoding: UTF-8 (undecodable bytes are replaced
		# rather than raising)
		text_partial = f.content.decode(encoding='utf-8', errors='replace')

		# Remove the start mark and everything before it
		match = after_start.search(text_partial)
		if match is None:
			raise ValueError(
				'Start marker not found in {url}'.format(url=url)
			)
		text_partial = match.group(1)

		# Remove the end mark and everything after it
		match = before_end.search(text_partial)
		if match is None:
			raise ValueError(
				'End marker not found in {url}'.format(url=url)
			)
		text_partial = match.group(1)

		# Collect the cleaned text
		parts.append(text_partial)

		# Status
		print('Access and preprocess >>> {url}'.format(url=url))

	# Full corpus: the cleaned texts concatenated without a separator
	text = ''.join(parts)
No results found