wsdookadr · September 26, 2023 00:57
diff --git a/README.md b/README.md
diff --git a/config.yaml b/config.yaml
 # Browsertrix Crawler configuration: a single seed whose sitemap URL points
 # at sitemap_custom.xml on the same "web" host.
 seeds:
   - url: http://web/
     sitemap: "http://web/sitemap_custom.xml"
     depth: 1

 # Block rules are a crawl-wide option, so they are declared at the top level
 # rather than inside the seed entry.
 blockRules:
   - url: googleanalytics.com
   # Google Analytics is served from the hyphenated host name, which the entry
   # above does not match.
   - url: google-analytics.com
   - url: www.googletagmanager.com
   - url: googletagmanager.com

 combineWARC: true
diff --git a/docker-compose.yml b/docker-compose.yml
 # Two-service stack: `web` is an nginx container with ./html mounted as its
 # document root, and `crawler` runs Browsertrix Crawler once `web` is healthy.
 version: "3.1"

 services:
   web:
     image: nginx:1.25.2
     privileged: true
     shm_size: 1gb
     ports:
       - "80:80"
     volumes:
       # Site content and server configuration are mounted read-only.
       - ./html:/usr/share/nginx/html:ro
       - ./nginx.default.conf:/etc/nginx/conf.d/default.conf:ro
     healthcheck:
       # HEAD request against the server, issued from inside the container.
       test:
         - "CMD"
         - "curl"
         - "-I"
         - "http://localhost"
       interval: 1s
       start_period: 3s

   crawler:
     image: webrecorder/browsertrix-crawler:0.11.1
     privileged: true
     shm_size: 3gb
     cap_add:
       - NET_ADMIN
       - SYS_ADMIN
     # Hold the crawler back until the healthcheck of `web` passes.
     depends_on:
       web:
         condition: service_healthy
     environment:
       - DISPLAY=:0.0
     volumes:
       - ./crawls:/crawls/
       - ./config.yaml:/app/crawl-config.yaml
       # Host D-Bus and X11 paths shared with the container.
       - /var/run/dbus:/var/run/dbus
       - /run/dbus:/run/dbus
       - /tmp/.X11-unix:/tmp/.X11-unix
       - /home/user/.Xauthority:/root/.Xauthority
     command: "crawl --behaviors autoscroll,autofetch --behaviorTimeout 8 --pageLoadTimeout 10 --workers 30 --headless --config /app/crawl-config.yaml"


diff --git a/fix_sitemap.sh b/fix_sitemap.sh
 #!/bin/bash
 # Build html/sitemap_custom.xml from html/sitemap.xml: keep the /guide/en
 # entries and point them at the local "web" host instead of www.elastic.co.

 # Sitemap header; the quoted delimiter keeps the here-document literal.
 cat <<'EOF' > html/sitemap_custom.xml
 <?xml version="1.0" encoding="UTF-8"?>
 <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
 EOF

 # grep reads the file directly, so no separate cat process is needed.
 grep "<loc>" html/sitemap.xml | grep "/guide/en" | \
 perl -ne '
 # Skip lines without a complete elastic.co <loc> element; otherwise an
 # empty <loc></loc> entry would be written for them.
 next unless m{<loc>(https?://www\.elastic\.co/[^<>]*?)</loc>};
 $url = $1;

 # Rewrite http as well as https originals, matching what the capture accepts.
 $url =~ s{https?://www\.elastic\.co/guide}{http://web}g;

 print "
 <url>
    <loc>$url</loc>
    <lastmod>2023-07-10T16:05:35+00:00</lastmod>
    <changefreq>weekly</changefreq>
    <priority>0.5</priority>
 </url>
 ";

 # Close the root element once all input lines have been processed.
 END{ print "</urlset>\n"; }
 ' >> html/sitemap_custom.xml
diff --git a/monitor.sh b/monitor.sh
 #!/bin/bash
 # Follow the most recently modified crawl log and print one compact JSON
 # object per crawlStatus entry, with its pendingPages field removed.

 # find prints "<mtime> <path>", sort orders by mtime, tail keeps the newest,
 # and cut keeps everything after the first space so that paths containing
 # spaces stay intact.
 LATEST=$(find crawls/ -name "crawl*.log" -printf "%T@ %p\n" | sort -n | tail -1 | cut -d' ' -f2-)

 # With no file argument tail -f would wait on stdin; report and stop instead.
 if [ -z "$LATEST" ]; then
     echo "monitor.sh: no crawl*.log found under crawls/" >&2
     exit 1
 fi

 tail -f "$LATEST" | \
 jq -c -r 'select(.context | contains("crawlStatus")) | .details | del(.pendingPages)'
diff --git a/nginx.default.conf b/nginx.default.conf
 # Serves the content of /usr/share/nginx/html on port 80 (IPv4 and IPv6).
 server {
     listen       80;
     listen  [::]:80;
     server_name  localhost;

     # Declared once at server level so every location inherits it, including
     # requests that stay in the regex locations below because no rewrite
     # rule matched them.
     root   /usr/share/nginx/html;

     # /guide/static... is served from /static...
     location ~ ^/guide/static {
         rewrite ^/guide/(static.*)$ /$1 last;
     }

     # /static-res/styles/font-files/<file> is served from /static/<file>
     location ~ ^/static-res {
         rewrite ^/static-res/styles/font-files/(.*)$ /static/$1 last;
     }

     location / {
         index  index.html index.htm;
     }

     error_page   500 502 503 504  /50x.html;
     location = /50x.html {
     }
 }


diff --git a/start.sh b/start.sh
 #!/bin/bash
 # Regenerate the custom sitemap, remove stopped service containers, then run
 # the stack; the exit status is taken from the crawler service.

 # Stop at the first failing step so the crawl never starts after a failed
 # sitemap generation.
 set -e

 # The steps below use paths relative to the directory holding this script.
 cd "$(dirname "$0")"

 # rm -rf crawls/*
 ./fix_sitemap.sh
 docker-compose rm --force
 time docker-compose up --exit-code-from crawler
	# Browsertrix Crawler configuration: a single seed whose sitemap URL points
	# at sitemap_custom.xml on the same "web" host.
	seeds:
	  - url: http://web/
	    sitemap: "http://web/sitemap_custom.xml"
	    depth: 1

	# Block rules are a crawl-wide option, so they are declared at the top level
	# rather than inside the seed entry.
	blockRules:
	  - url: googleanalytics.com
	  # Google Analytics is served from the hyphenated host name, which the entry
	  # above does not match.
	  - url: google-analytics.com
	  - url: www.googletagmanager.com
	  - url: googletagmanager.com

	combineWARC: true
	# Two-service stack: `web` is an nginx container with ./html mounted as its
	# document root, and `crawler` runs Browsertrix Crawler once `web` is healthy.
	version: "3.1"

	services:
	  web:
	    image: nginx:1.25.2
	    privileged: true
	    shm_size: 1gb
	    ports:
	      - "80:80"
	    volumes:
	      # Site content and server configuration are mounted read-only.
	      - ./html:/usr/share/nginx/html:ro
	      - ./nginx.default.conf:/etc/nginx/conf.d/default.conf:ro
	    healthcheck:
	      # HEAD request against the server, issued from inside the container.
	      test: ["CMD", "curl", "-I", "http://localhost"]
	      interval: 1s
	      start_period: 3s

	  crawler:
	    image: webrecorder/browsertrix-crawler:0.11.1
	    privileged: true
	    shm_size: 3gb
	    cap_add:
	      - NET_ADMIN
	      - SYS_ADMIN
	    # Hold the crawler back until the healthcheck of `web` passes.
	    depends_on:
	      web:
	        condition: service_healthy
	    environment:
	      - DISPLAY=:0.0
	    volumes:
	      - ./crawls:/crawls/
	      - ./config.yaml:/app/crawl-config.yaml
	      # Host D-Bus and X11 paths shared with the container.
	      - /var/run/dbus:/var/run/dbus
	      - /run/dbus:/run/dbus
	      - /tmp/.X11-unix:/tmp/.X11-unix
	      - /home/user/.Xauthority:/root/.Xauthority
	    command: "crawl --behaviors autoscroll,autofetch --behaviorTimeout 8 --pageLoadTimeout 10 --workers 30 --headless --config /app/crawl-config.yaml"
	#!/bin/bash
	# Build html/sitemap_custom.xml from html/sitemap.xml: keep the /guide/en
	# entries and point them at the local "web" host instead of www.elastic.co.

	# Sitemap header; the quoted delimiter keeps the here-document literal.
	cat <<'EOF' > html/sitemap_custom.xml
	<?xml version="1.0" encoding="UTF-8"?>
	<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
	EOF

	# Plain (unescaped) pipes chain the commands; grep reads the file directly.
	grep "<loc>" html/sitemap.xml | grep "/guide/en" | \
	perl -ne '
	# Skip lines without a complete elastic.co <loc> element; otherwise an
	# empty <loc></loc> entry would be written for them.
	next unless m{<loc>(https?://www\.elastic\.co/[^<>]*?)</loc>};
	$url = $1;

	# Rewrite http as well as https originals, matching what the capture accepts.
	$url =~ s{https?://www\.elastic\.co/guide}{http://web}g;

	print "
	<url>
	   <loc>$url</loc>
	   <lastmod>2023-07-10T16:05:35+00:00</lastmod>
	   <changefreq>weekly</changefreq>
	   <priority>0.5</priority>
	</url>
	";

	# Close the root element once all input lines have been processed.
	END{ print "</urlset>\n"; }
	' >> html/sitemap_custom.xml
	#!/bin/bash
	# Follow the most recently modified crawl log and print one compact JSON
	# object per crawlStatus entry, with its pendingPages field removed.

	# find prints "<mtime> <path>", sort orders by mtime, tail keeps the newest,
	# and cut keeps everything after the first space so that paths containing
	# spaces stay intact.
	LATEST=$(find crawls/ -name "crawl*.log" -printf "%T@ %p\n" | sort -n | tail -1 | cut -d' ' -f2-)

	# With no file argument tail -f would wait on stdin; report and stop instead.
	if [ -z "$LATEST" ]; then
	    echo "monitor.sh: no crawl*.log found under crawls/" >&2
	    exit 1
	fi

	tail -f "$LATEST" | \
	jq -c -r 'select(.context | contains("crawlStatus")) | .details | del(.pendingPages)'
	# Serves the content of /usr/share/nginx/html on port 80 (IPv4 and IPv6).
	server {
	    listen 80;
	    listen [::]:80;
	    server_name localhost;

	    # Declared once at server level so every location inherits it, including
	    # requests that stay in the regex locations below because no rewrite
	    # rule matched them.
	    root /usr/share/nginx/html;

	    # /guide/static... is served from /static...
	    location ~ ^/guide/static {
	        rewrite ^/guide/(static.*)$ /$1 last;
	    }

	    # /static-res/styles/font-files/<file> is served from /static/<file>
	    location ~ ^/static-res {
	        rewrite ^/static-res/styles/font-files/(.*)$ /static/$1 last;
	    }

	    location / {
	        index index.html index.htm;
	    }

	    error_page 500 502 503 504 /50x.html;
	    location = /50x.html {
	    }
	}
	#!/bin/bash
	# Regenerate the custom sitemap, remove stopped service containers, then run
	# the stack; the exit status is taken from the crawler service.

	# Stop at the first failing step so the crawl never starts after a failed
	# sitemap generation.
	set -e

	# The steps below use paths relative to the directory holding this script.
	cd "$(dirname "$0")"

	# rm -rf crawls/*
	./fix_sitemap.sh
	docker-compose rm --force
	time docker-compose up --exit-code-from crawler