from . import command
from engora import CONFIG
from pathlib import Path
from typer import Argument, Option
from typing import List, Optional

# CLI parameters that are not forwarded to the runner directly; instead they
# are copied onto CONFIG as crawl_<name> (see the bottom of crawl()).
_CFGS = (
    'max_items',
    'max_requests',
    'only',
    'start',
    'urls',
    'write_links_to_journal',
)


@command(help='Run a spider')
def crawl(
    name: str = Argument(
        ...,
        help='Name of spider to run',
    ),
    assignments: List[str] = Argument(
        None,
        help='Assign settings or configurations by using key=value',
    ),
    batch_item_count: int = Option(
        None, '--batch-item-count', '-b',
        help='Count of items per feed file (0 means all in one file)',
    ),
    concurrency: int = Option(
        0, '--concurrency', '-c',
        help='How many concurrent connections to run',
    ),
    edit: bool = Option(
        False, '--edit', '-e',
        help='Open the stats file in the text editor',
    ),
    limited: bool = Option(
        False, '--limited', '-l',
        help='Only perform a limited subset of queries (spider-dependent)',
    ),
    max_items: Optional[int] = Option(
        None, '--max-items',
        help='Limit the number of items written',
    ),
    max_requests: Optional[int] = Option(
        None, '--max-requests', '-m',
        help='Limit the number of HTTP requests made',
    ),
    only: Optional[List[str]] = Option(
        None, '--only', '-o',
        help='Restrict to only these URLs',
    ),
    product_only: bool = Option(
        False, '--product_only', '-p',
        help='Only emit product records to output file',
    ),
    rerun: Optional[bool] = Option(
        False, '--rerun',
        help='Rerun an existing job from where it left off',
    ),
    start: Optional[str] = Option(
        None, '--start', '-s',
        help='Which URL to start the crawl from',
    ),
    target: Optional[str] = Option(
        None, '--target', '-t',
        help='Name of existing directory to use',
    ),
    urls: Optional[List[str]] = Option(
        None, '--urls', '-u',
        help='Crawl exactly these URLs',
    ),
    use_csv: bool = Option(
        False, '--use_csv',
        help='Use CSV format for output',
    ),
    write_links_to_journal: bool = Option(
        False, '--write-links-to-journal', '-w',
        help='Write URLs to the journal file',
    ),
    work_dir: Optional[Path] = Option(
        None, '--work-dir',
        help='Name of existing directory to use',
    ),
):
    # Snapshot the call's parameters, move the _CFGS entries onto CONFIG
    # (prefixed with 'crawl_'), and forward everything else to the runner.
    d = dict(locals())
    for c in _CFGS:
        if (v := d.pop(c)) is not None:
            setattr(CONFIG, 'crawl_' + c, v)
    from engora.crawl import runner
    runner(**d)
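
For reference, a minimal sketch of the pattern used at the bottom of crawl(): the parameters named in _CFGS are popped out of locals() and copied onto a config object, and the remaining parameters are forwarded to the runner. The sketch below uses stand-in names (SimpleNamespace instead of the real engora CONFIG, and it returns the dict rather than calling runner), so it only illustrates the mechanism, not the actual engora API.

# Stand-in config object and a trimmed-down crawl() showing the same routing.
from types import SimpleNamespace

CONFIG = SimpleNamespace()
_CFGS = ('max_items', 'start')

def crawl(name, max_items=None, start=None, concurrency=0):
    d = dict(locals())                       # snapshot of all parameters
    for c in _CFGS:
        if (v := d.pop(c)) is not None:      # drop from d; keep only if set
            setattr(CONFIG, 'crawl_' + c, v)
    return d                                 # what would be passed to runner(**d)

print(crawl('shop', max_items=10))   # {'name': 'shop', 'concurrency': 0}
print(vars(CONFIG))                  # {'crawl_max_items': 10}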