Created
June 22, 2017 16:51
-
-
Save benagricola/0c22eeb64a94a66d3fe3b16e7a0cd7e7 to your computer and use it in GitHub Desktop.
One-file, redistributable Scrapy-based crawler, packaged with PyInstaller. Generate the binary using `pyinstaller scrape.spec`.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# PyInstaller hook for the 'cot' Scrapy project.
# Scrapy loads spiders/pipelines by dotted path at runtime, so PyInstaller's
# static analysis cannot see them; force every 'cot' submodule to be bundled.
# Rename 'cot' here to match your own Scrapy project's package name.
from PyInstaller.utils.hooks import collect_submodules, collect_data_files

hiddenimports = collect_submodules('cot')
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# PyInstaller hook bundling Scrapy's dynamically imported modules and data.
from PyInstaller.utils.hooks import collect_submodules, collect_data_files

# Scrapy resolves pipelines, extensions and utils by dotted path at runtime,
# so each submodule tree must be enumerated explicitly for the freeze.
_packages = (
    'scrapy',
    'scrapy.pipelines',
    'scrapy.extensions',
    'scrapy.utils',
    'cot',
)
hiddenimports = [mod for pkg in _packages for mod in collect_submodules(pkg)]

# Include Scrapy's non-code data files (templates, default settings, etc.).
datas = collect_data_files('scrapy')
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python
"""CLI entry point for the 'cot' Scrapy crawler.

Selects a settings module (offline vs public), layers command-line
overrides on top, then either prints the resolved settings as JSON
or starts the crawl.
"""
import json, sys
import scrapy, argparse
from scrapy.settings import Settings
from scrapy.crawler import CrawlerProcess
from cot.spiders.cot import CotSpider
from cot.utils import CotEncoder


def main():
    parser = argparse.ArgumentParser(description='Website Crawler')
    parser.add_argument('--domain', type=str, choices=['test.com'], required=True, help='Sets domain to crawl')
    parser.add_argument('--mode', type=str, choices=['incremental', 'full'], nargs='?', default='incremental', help='Sets run mode. Incremental must be run on an existing directory')
    parser.add_argument('--output', type=str, choices=['offline', 'public'], nargs='?', default='public', help='Sets output type for either the public or offline')
    parser.add_argument('--print', action='store_true', dest='do_print', help='Prints resolved crawler settings and exits')
    # maxsplit=1 so values containing '=' (e.g. URLs with query strings)
    # stay intact; a bare split would yield >2 items and break dict() below.
    parser.add_argument('--arg', type=lambda kv: kv.split("=", 1), dest='args', nargs='?', action='append', help='Specify multiple times with key=value arguments to override settings')
    args = parser.parse_args()

    settings_obj = Settings()

    # Import only the requested settings variant at runtime.
    if args.output == 'offline':
        from cot import offline_settings as settings
    else:
        from cot import public_settings as settings
    settings_obj.setmodule(settings, priority='project')

    # Command-line overrides take precedence over the project module.
    if args.args:
        settings_obj.setdict(dict(args.args), priority='cmdline')

    # Force CRAWL_TYPE / DOMAIN at the highest priority so nothing in the
    # settings modules can shadow them.
    settings_obj.set('CRAWL_TYPE', args.mode, priority=settings_obj.maxpriority())
    settings_obj.set('DOMAIN', args.domain, priority=settings_obj.maxpriority())

    if args.do_print:
        try:
            print(json.dumps(dict(settings_obj), indent=4, separators=(',', ': '), sort_keys=True, cls=CotEncoder))
        except Exception as e:  # fixed Python 2-only `except Exception, e:` syntax
            print('Unable to print resolved settings: %s' % (str(e)))
            sys.exit(1)
        return

    process = CrawlerProcess(settings=settings_obj)
    process.crawl(CotSpider)
    process.start()


if __name__ == "__main__":
    main()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# PyInstaller spec file. Build the one-file binary with:
#   pyinstaller scrape.spec
# NOTE: Analysis, PYZ and EXE are injected into this file's namespace by
# PyInstaller when it executes the spec; they are not importable names.

block_cipher = None  # no bytecode encryption
options = []

a = Analysis(['scrape.py'],
             pathex=['/opt/cot-scraper'],
             binaries=[],
             datas=[],
             hiddenimports=[],
             # Project-local hooks directory holding the scrapy/cot hooks above.
             hookspath=['hooks'],
             runtime_hooks=[],
             excludes=[],
             win_no_prefer_redirects=False,
             win_private_assemblies=False,
             cipher=block_cipher)

pyz = PYZ(a.pure, a.zipped_data,
          cipher=block_cipher)

# exclude_binaries=False bundles all binaries into the single executable.
exe = EXE(pyz,
          a.scripts,
          a.binaries,
          a.zipfiles,
          a.datas,
          options,
          exclude_binaries=False,
          name='scrape',
          debug=False,
          strip=False,
          upx=True,
          console=True)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Regarding the line `from cot import offline_settings as settings`: I cannot find `offline_settings` in the `cot` package. Could you tell me where it is defined or how to create it? Thanks.