Created
April 2, 2018 01:02
-
-
Save PhirePhly/e80e507a5f669d1b8b07c0ebdaa3c68f to your computer and use it in GitHub Desktop.
Adding caching to dedupe queries for NewsGrabber
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
diff --git a/dedupe.py b/dedupe.py
index 26230a6..1e346ff 100644
--- a/dedupe.py
+++ b/dedupe.py
@@ -11,6 +11,10 @@ import warcio
from warcio.archiveiterator import ArchiveIterator | |
from warcio.warcwriter import WARCWriter | |
+proxies = { | |
+ 'http': 'http://127.0.0.1:8080', | |
+} | |
+ | |
if not warcio.__file__ == os.path.join(os.getcwd(), 'warcio', '__init__.pyc'): | |
print('Warcio was not imported correctly.') | |
print('Location: ' + warcio.__file__ + '.') | |
@@ -27,7 +31,7 @@ def ia_available(url, digest): | |
try: | |
tries += 1 | |
ia_data = requests.get('http://NewsGrabberDedupe.b-cdn.net/{hashed}' \ | |
- .format(hashed=hashed), timeout=60) | |
+ .format(hashed=hashed), timeout=60, proxies=proxies) | |
if not ';' in ia_data.text: | |
return False | |
 return ia_data.text.split(';', 1)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment