Created
November 15, 2016 16:57
-
-
Save d0ugal/38c55259da05dce56ccd9a3f72b91f9e to your computer and use it in GitHub Desktop.
Broken Mistral Crawler
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Mistral v2 workbook: a small recursive web crawler.
#
# Before starting this, you need to create the Mistral environment that
# holds the list of already-visited URLs:
#
#   $ cat ~/env.json
#   {"name": "crawler", "variables": {"visited": []}}
#   $ mistral environment-create ~/env.json
---
version: '2.0'

name: crawler
description: Web Scraping Workbook

workflows:

  get_web_page:
    description: >
      Fetch a given URL. Store it. Extract all the links and queue them.
    input:
      - url

    tasks:

      # Gate: only continue when the URL mentions the allowed domain.
      # When the guard is false no transition fires and the workflow ends here.
      limit_domain:
        action: std.noop
        on-success:
          - get_mistral_env: <% 'dougalmatthews.com' in $.url %>

      # Read the 'crawler' environment so the visited list can be consulted.
      # Both follow-up tasks use the same guard: proceed only for unseen URLs.
      get_mistral_env:
        action: mistral.environments_get name='crawler'
        on-success:
          - mark_visited: <% not $.url in task(get_mistral_env).result.variables.visited %>
          - download_url: <% not $.url in task(get_mistral_env).result.variables.visited %>

      # Prepend this URL to the visited list and write it back.
      # NOTE(review): this is a read-modify-write on a shared environment with
      # no locking; sub-workflows started by follow_links may overwrite each
      # other's updates -- confirm whether that is acceptable.
      mark_visited:
        action: mistral.environments_update name='crawler' variables=<% {visited => [$.url] + task(get_mistral_env).result.variables.visited} %>

      # Fetch the page, following HTTP redirects.
      download_url:
        action: std.http url=<% $.url %> allow_redirects=True
        on-success: extract_links

      # Run a URL-matching regex over the downloaded body and publish every
      # match as 'urls'.
      extract_links:
        action: std.noop
        publish:
          urls: <% regex('((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:\'".,<>?\xab\xbb\u201c\u201d\u2018\u2019]))').searchAll(task(download_url).result.content) %>
        on-success:
          - echo_links
          - follow_links

      # Debug aid: echo the extracted links as the task result.
      echo_links:
        action: std.echo output=<% task(extract_links).published.urls %>

      # Recurse: start one get_web_page sub-workflow per extracted link.
      follow_links:
        with-items: suburl in <% task(extract_links).published.urls %>
        workflow: get_web_page url=<% $.suburl %>
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
maybe use env() to do this?