Skip to content

Instantly share code, notes, and snippets.

@rec
Created May 31, 2022 08:00
Show Gist options
  • Save rec/018d38eda47c9a84712f62f5f15ac589 to your computer and use it in GitHub Desktop.
Save rec/018d38eda47c9a84712f62f5f15ac589 to your computer and use it in GitHub Desktop.
from . import command
from engora import CONFIG
from pathlib import Path
from typer import Argument, Option
from typing import List, Optional
# Names of the `crawl` parameters that are copied onto CONFIG (as
# `crawl_<name>`) instead of being forwarded to the runner.
_CFGS = (
    'max_items', 'max_requests', 'only', 'start', 'urls',
    'write_links_to_journal',
)
# CLI entry point for running a spider.
#
# Every parameter listed in _CFGS is removed from the argument set and, when
# it is not None, stored on CONFIG under the attribute name `crawl_<name>`.
# All remaining arguments are forwarded to `engora.crawl.runner` as keyword
# arguments.
#
# NOTE(review): documented with comments rather than a docstring so that
# `crawl.__doc__` is unchanged, in case `command` uses it for help text.
@command(help='Run a spider')
def crawl(
    name: str = Argument(
        ...,
        help='Name of spider to run',
    ),
    assignments: List[str] = Argument(
        None,
        help='Assign settings or configurations by using key=value',
    ),
    batch_item_count: int = Option(
        None, '--batch-item-count', '-b',
        help='Count of items per feed file (0 means all in one file)',
    ),
    concurrency: int = Option(
        0, '--concurrency', '-c',
        help='How many concurrent connections to run',
    ),
    edit: bool = Option(
        False, '--edit', '-e',
        help='Open the stats file in the text editor',
    ),
    limited: bool = Option(
        False, '--limited', '-l',
        help='Only perform a limited subset of queries (spider-dependent)',
    ),
    max_items: Optional[int] = Option(
        None, '--max-items',
        help='Limit the number of items written',
    ),
    max_requests: Optional[int] = Option(
        None, '--max-requests', '-m',
        help='Limit the number of http requests made',
    ),
    only: Optional[List[str]] = Option(
        None, '--only', '-o',
        help='Restrict to only these URLs',
    ),
    product_only: bool = Option(
        False, '--product_only', '-p',
        help='Only emit product records to output file',
    ),
    rerun: Optional[bool] = Option(
        False, '--rerun',
        help='Rerun an existing job from where it left off',
    ),
    start: Optional[str] = Option(
        None, '--start', '-s',
        help='Which URL to start the crawl from',
    ),
    target: Optional[str] = Option(
        None, '--target', '-t',
        help='Name of existing directory to use',
    ),
    urls: Optional[List[str]] = Option(
        None, '--urls', '-u',
        help='Crawl exactly these URLs',
    ),
    use_csv: bool = Option(
        False, '--use_csv',
        help='Use csv format for output',
    ),
    write_links_to_journal: bool = Option(
        False, '--write-links-to-journal', '-w',
        help='Write urls to the journal file',
    ),
    work_dir: Optional[Path] = Option(
        None, '--work-dir',
        help='Name of existing directory to use',
    ),
):
    # Must stay the first statement: at this point locals() holds exactly
    # the parameters above and nothing else.
    runner_kwargs = dict(locals())

    # Move the configuration-style options onto CONFIG; None means "not
    # given", so the existing CONFIG value is left alone.
    for cfg_name in _CFGS:
        cfg_value = runner_kwargs.pop(cfg_name)
        if cfg_value is None:
            continue
        setattr(CONFIG, f'crawl_{cfg_name}', cfg_value)

    # Imported inside the function, after CONFIG has been updated
    # (presumably deliberate -- confirm before hoisting to module level).
    from engora.crawl import runner

    runner(**runner_kwargs)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment