Last active
November 8, 2024 22:33
-
-
Save thunderpoot/58a748565d2e5b2582520fa535821908 to your computer and use it in GitHub Desktop.
An example of fetching a page from Common Crawl using the Common Crawl Index
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import requests | |
import json | |
# For parsing URLs: | |
from urllib.parse import quote_plus | |
# For parsing WARC records: | |
from warcio.archiveiterator import ArchiveIterator | |
# The URL of the Common Crawl Index server
SERVER = 'http://index.commoncrawl.org/'

# The Common Crawl index you want to query
INDEX_NAME = 'CC-MAIN-2024-33'  # Replace with the latest index name

# The URL you want to look up in the Common Crawl index
target_url = 'commoncrawl.org/faq'  # Replace with your target URL

# It's advisable to use a descriptive User-Agent string when developing your
# own applications. This practice aligns with the conventions outlined in
# RFC 7231. Let's use this simple one; the contact address is a placeholder,
# so replace it with your own before running the script.
myagent = 'cc-get-started/1.0 (Example data retrieval script; yourname@example.com)'
# Function to search the Common Crawl Index
def search_cc_index(url):
    """Look up captures of ``url`` in the configured Common Crawl index.

    The query URL is built from the module-level ``SERVER`` and
    ``INDEX_NAME`` settings and is sent with the ``myagent`` User-Agent.
    The raw response body is echoed to stdout.

    Args:
        url: The URL to look up; it is percent-encoded before being sent.

    Returns:
        A list with one decoded JSON object per non-blank line of the
        response body when the server answers 200 (an empty list if the
        body has no such lines), otherwise ``None``.

    Raises:
        requests.exceptions.RequestException: On a network failure or when
            the server does not answer within the timeout.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    encoded_url = quote_plus(url)
    index_url = f'{SERVER}{INDEX_NAME}-index?url={encoded_url}&output=json'
    # Without a timeout a stalled connection would block the script forever.
    response = requests.get(
        index_url,
        headers={'user-agent': myagent},
        timeout=30,
    )
    print("Response from server:\r\n", response.text)
    if response.status_code != 200:
        return None
    # The body is newline-delimited JSON. Blank lines are skipped so that an
    # empty body yields [] instead of making json.loads('') raise.
    return [
        json.loads(line)
        for line in response.text.split('\n')
        if line.strip()
    ]
# Function to fetch content from Common Crawl
def fetch_page_from_cc(records):
    """Return the content of the first capture in ``records`` that can be fetched.

    Each record is tried in order: the byte range it describes is requested
    from ``https://data.commoncrawl.org/`` and the returned data is parsed
    as WARC. A record whose request does not come back as 206 (Partial
    Content) is reported and skipped, so one failed download no longer
    hides the remaining captures.

    Args:
        records: Iterable of index records; each must provide the keys
            ``filename``, ``offset`` and ``length``.

    Returns:
        The content read from the first WARC record of type ``response``,
        or ``None`` when no record yields one (including when ``records``
        is empty).

    Raises:
        requests.exceptions.RequestException: On a network failure or when
            the server does not answer within the timeout.
    """
    for record in records:
        offset, length = int(record['offset']), int(record['length'])
        s3_url = f'https://data.commoncrawl.org/{record["filename"]}'

        # Define the byte range for the request; the end position of an
        # HTTP Range header is inclusive, hence the "- 1".
        byte_range = f'bytes={offset}-{offset+length-1}'

        # `stream=True` gives access to the raw byte stream, which is needed
        # because the data is gzip compressed. The `with` block releases the
        # connection on every path; the timeout guards against a stalled
        # connection.
        with requests.get(
            s3_url,
            headers={'user-agent': myagent, 'Range': byte_range},
            stream=True,
            timeout=30,
        ) as response:
            if response.status_code != 206:
                print(f"Failed to fetch data: {response.status_code}")
                continue

            # Create an `ArchiveIterator` directly from `response.raw`,
            # which handles the gzipped WARC content.
            for warc_record in ArchiveIterator(response.raw):
                if warc_record.rec_type == 'response':
                    # The content is read in full before the `with` block
                    # closes the underlying connection.
                    return warc_record.content_stream().read()

    print("No valid WARC record found in the given records")
    return None
# Query the index for captures of the target URL
records = search_cc_index(target_url)
if not records:
    print(f"No records found for {target_url}")
else:
    print(f"Found {len(records)} records for {target_url}")

    # Pass the index records on to retrieve the page content
    content = fetch_page_from_cc(records)
    if content:
        print(f"Successfully fetched content for {target_url}")
        # From here the 'content' variable can be processed as needed,
        # using something like Beautiful Soup, etc
how would one construct the record object inline on line #37 of the example code?
how would one construct the record object inline on line #37 of the example code?
# Of course import `ArchiveIterator` from `warcio`...
from warcio.archiveiterator import ArchiveIterator
# Use `stream=True` in the call to `requests.get()` to get a raw byte stream
# because it's gzip compressed data...
response = requests.get(
s3_url,
headers={'Range': f'bytes={offset}-{offset+length-1}'},
stream=True
)
# Create an `ArchiveIterator` object directly from `response.raw`
# which handles the gzipped WARC content...
# Iterate through the WARC records, looking for a 'response' type record and
# when hitting a 'response' record we return the content
# using `warc_record.content_stream().read()` like so...
if response.status_code == 206:
stream = ArchiveIterator(response.raw)
for warc_record in stream:
if warc_record.rec_type == 'response':
return warc_record.content_stream().read()
else:
print(f"Failed to fetch data: {response.status_code}")
Hope this helps!
Thanks. I tried this, but it doesn't find any `warc_record` in the `stream`.
Sorry, never mind. I forgot to add stream=True
.
Thanks for your help!
Sorry, never mind. I forgot to add
stream=True
.Thanks for your help!
Great, glad to hear it!
I've updated this gist to include the example in the comment above, since it's a very commonly-asked question.
That’s great! Thank you.
…On Tue, Aug 27, 2024 at 11:02 AM underwood ***@***.***> wrote:
***@***.**** commented on this gist.
------------------------------
I've updated this gist to include the example in the comment above, since
it's a very commonly-asked question.
—
Reply to this email directly, view it on GitHub
<https://gist.github.com/thunderpoot/58a748565d2e5b2582520fa535821908#gistcomment-5169476>
or unsubscribe
<https://github.com/notifications/unsubscribe-auth/AZPQYAPCBQP2IUYEPLS5UGLZTS5K3BFKMF2HI4TJMJ2XIZLTSKBKK5TBNR2WLJDUOJ2WLJDOMFWWLO3UNBZGKYLEL5YGC4TUNFRWS4DBNZ2F6YLDORUXM2LUPGBKK5TBNR2WLJDHNFZXJJDOMFWWLK3UNBZGKYLEL52HS4DFVRZXKYTKMVRXIX3UPFYGLK2HNFZXIQ3PNVWWK3TUUZ2G64DJMNZZDAVEOR4XAZNEM5UXG5FFOZQWY5LFVEYTENRQG42TOMJTU52HE2LHM5SXFJTDOJSWC5DF>
.
You are receiving this email because you commented on the thread.
Triage notifications on the go with GitHub Mobile for iOS
<https://apps.apple.com/app/apple-store/id1477376905?ct=notification-email&mt=8&pt=524675>
or Android
<https://play.google.com/store/apps/details?id=com.github.android&referrer=utm_campaign%3Dnotification-email%26utm_medium%3Demail%26utm_source%3Dgithub>
.
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Here's an example of how to use `warcio` to parse a WARC record:
For more info, refer to the `warcio` documentation.