Created
November 5, 2020 20:17
-
-
Save boxabirds/72121ba51004a91a4dce107f5eca40f9 to your computer and use it in GitHub Desktop.
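When Scrapinghub.com is given a non-HTML URL (specifically a PDF), it fails without reporting an error: it returns HTTP 200 with an "article" built from a WebKit error page, and it's unclear how to detect this short of matching on the error text (which could change).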
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
DEBUG urllib3.connectionpool:connectionpool.py:442 https://autoextract.scrapinghub.com:443 "POST /v1/extract HTTP/1.1" 200 None
DEBUG root:article.py:247 Download time took 3.2365484610000004s
DEBUG root:article.py:249 {
  "query": {
    "id": "1604607287688-bc624d2ae5f0dbd1",
    "domain": "netlify.app",
    "userQuery": {
      "url": "https://lisn-tests.netlify.app/pdf.pdf",
      "pageType": "article"
    }
  },
  "webPage": {
    "inLanguages": [
      {
        "code": "en"
      }
    ]
  },
  "article": {
    "articleBody": "https://lisn-tests.netlify.app/pdf.pdf",
    "articleBodyRaw": "<body>\n <h1>Failed loading page (Frame load interrupted by policy change)</h1>\n <h2>https://lisn-tests.netlify.app/pdf.pdf</h2>\n <p>WebKit error #102</p>\n </body>",
    "articleBodyHtml": "<article>\n\n<h2>https://lisn-tests.netlify.app/pdf.pdf</h2>\n\n</article>",
    "headline": "Failed loading page (Frame load interrupted by policy change)",
    "datePublishedRaw": "WebKit error #102",
    "url": "https://lisn-tests.netlify.app/pdf.pdf",
    "probability": 0.0014535423
  },
  "algorithmVersion": "20.10.1"
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment