Last active
April 22, 2024 18:12
-
-
Save Irio/3da6ee4dea8cad6613c1337a15044f09 to your computer and use it in GitHub Desktop.
GCP Serverless scrapers
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
# Terraform "external" data source program: polls the Cloud Run service
# "wohnung" until it reports a URL, then prints that URL as a JSON object.
#
# Input  (stdin):  JSON object with the keys "project" and "region".
# Output (stdout): JSON object of the form {"url": "..."}.
# Environment:     MAX_ATTEMPTS (optional) caps the number of polls; the
#                  default of 180 is about 15 minutes with the 5 second pause.
set -e

# jq's @sh filter shell-quotes both values before they reach eval.
eval "$(jq -r '@sh "PROJECT=\(.project) REGION=\(.region)"')"

MAX_ATTEMPTS="${MAX_ATTEMPTS:-180}"
attempt=0

while true
do
  # pipefail is intentionally not set: the pipeline's status is jq's, so a
  # failing gcloud call just produces an empty URL and the loop polls again.
  URL=$(gcloud beta run services describe wohnung \
    --platform managed \
    --project "$PROJECT" \
    --region "$REGION" \
    --format json | jq --raw-output '.status.url // empty')
  if [ -n "$URL" ]
  then
    break
  fi

  attempt=$((attempt + 1))
  if [ "$attempt" -ge "$MAX_ATTEMPTS" ]
  then
    echo "Cloud Run service 'wohnung' reported no URL after $attempt attempts" >&2
    exit 1
  fi
  sleep 5
done

# Let jq build the JSON so the URL is always escaped correctly.
jq --null-input --compact-output --arg url "$URL" '{url: $url}'
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Input variables, listed alphabetically.

variable "bucket" {
  description = "Google Cloud Storage bucket name"
}

variable "cloud_source_repository" {
  description = "Google Cloud Source repository name"
}

variable "crontab_schedule" {
  description = "Crontab schedule for running scrapers"
}

variable "project" {
  description = "Google Cloud project ID"
}

variable "region" {
  description = "Google Cloud region"
}

variable "zone" {
  description = "Google Cloud zone, part of the provided region"
}

# Container image reference used by the build, push and deploy steps of the
# Cloud Build trigger.
locals {
  container_tag = "gcr.io/${var.project}/wohnung:latest"
}
# Google provider: project, region and zone come from the input variables;
# credentials are read from the local file credentials.json.
provider "google" {
  project     = "${var.project}"
  region      = "${var.region}"
  zone        = "${var.zone}"
  credentials = "${file("credentials.json")}"
}
# Storage bucket named after var.bucket, created in the "US" location.
resource "google_storage_bucket" "items" {
  location = "US"
  name     = "${var.bucket}"
}
# App Engine application for the project; the scheduler job below declares a
# dependency on it.
resource "google_app_engine_application" "app" {
  location_id = "${var.region}"
  project     = "${var.project}"
}
# Scheduler job that POSTs to the root path of the Cloud Run service on the
# configured crontab schedule (evaluated in UTC).
resource "google_cloud_scheduler_job" "job" {
  name        = "run-scrapers"
  description = "Trigger scrapers"
  time_zone   = "Etc/UTC"
  schedule    = "${var.crontab_schedule}"

  # NOTE(review): presumably Cloud Scheduler needs the App Engine application
  # to exist first - confirm against the provider documentation.
  depends_on = ["google_app_engine_application.app"]

  http_target {
    uri         = "${data.external.google_cloud_run_service.result.url}/"
    http_method = "POST"
  }
}
# Cloud Build trigger for the master branch of the source repository. Its
# build creates the container image, pushes it and deploys it to Cloud Run
# as the service "wohnung".
resource "google_cloudbuild_trigger" "default" {
  substitutions = {
    _BUCKET = "${var.bucket}"
  }

  trigger_template {
    repo_name   = "${var.cloud_source_repository}"
    branch_name = "master"
  }

  build {
    images = ["${local.container_tag}"]

    # Step 1: build the image from the repository root.
    step {
      name = "gcr.io/cloud-builders/docker"
      args = ["build", "-t", "${local.container_tag}", "."]
    }

    # Step 2: push the image.
    step {
      name = "gcr.io/cloud-builders/docker"
      args = ["push", "${local.container_tag}"]
    }

    # Step 3: deploy the pushed image to Cloud Run.
    step {
      name = "gcr.io/cloud-builders/gcloud"

      # "$$" escapes Terraform interpolation so that Cloud Build receives the
      # literal ${_BUCKET} substitution.
      args = [
        "beta", "run", "deploy", "wohnung",
        "--region", "${var.region}",
        "--image", "${local.container_tag}",
        "--update-env-vars", "GCLOUD_BUCKET=$${_BUCKET}",
        "--memory", "1Gi",
        "--timeout", "10m",
        "--platform", "managed",
        "--allow-unauthenticated",
      ]
    }
  }

  # Runs trigger_build.sh locally as part of creating this resource.
  provisioner "local-exec" {
    command = "bash trigger_build.sh"
  }
}
# Runs get_service_url.sh with the project and region as its JSON query; the
# script's "url" result feeds the scheduler job's target URI.
data "external" "google_cloud_run_service" {
  program = ["bash", "get_service_url.sh"]

  query = {
    region  = "${var.region}"
    project = "${var.project}"
  }

  depends_on = ["google_cloudbuild_trigger.default"]
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Project and location
project = "realestate-berlin-254211"
region  = "us-east1"
zone    = "us-east1-b"

# Storage and source repository
bucket                  = "realestate-berlin-254211-items"
cloud_source_repository = "github_irio_wohnung"

# Scraper schedule in crontab syntax
crontab_schedule = "44 8 * * *"
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
# Looks up the Cloud Build trigger attached to the source repository and runs
# it once for the given branch.
#
# Environment (both optional; the defaults are the previously hard-coded values):
#   REPO_NAME  repository name used to filter the triggers (github_irio_wohnung)
#   BRANCH     branch to build (master)
set -e

REPO_NAME="${REPO_NAME:-github_irio_wohnung}"
BRANCH="${BRANCH:-master}"

# "// empty" makes a missing trigger an empty string instead of the text "null".
TRIGGER_ID=$(gcloud alpha builds triggers list \
  --filter="triggerTemplate.repoName:${REPO_NAME}" \
  --format json | jq --raw-output '.[0].id // empty')

if [ -z "$TRIGGER_ID" ]
then
  echo "No Cloud Build trigger found for repository '${REPO_NAME}'" >&2
  exit 1
fi

gcloud alpha builds triggers run "$TRIGGER_ID" \
  --branch "$BRANCH" > /dev/null
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Build stage: compile the wohnung binary with the Go 1.12 toolchain image.
FROM golang:1.12 AS build
WORKDIR $GOPATH/src/github.com/Irio/wohnung
COPY main.go .
COPY scraper scraper
# Download the dependencies, then install the binary (copied from /go/bin below).
RUN go get -d -v ./... \
 && go install

# Runtime stage: only the compiled binary is copied into the final image.
FROM gcr.io/distroless/base
COPY --from=build /go/bin/wohnung /
CMD ["/wohnung"]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package main | |
import ( | |
"fmt" | |
"log" | |
"net/http" | |
"os" | |
scraper "github.com/Irio/wohnung/scraper" | |
) | |
func main() { | |
http.HandleFunc("/", func(w http.ResponseWriter, r *http.Request) { | |
err := r.ParseForm() | |
if err != nil { | |
log.Println(err) | |
} | |
if r.Method == http.MethodPost { | |
fmt.Fprintln(w, scraper.Run(r.Form)) | |
} | |
}) | |
port := os.Getenv("PORT") | |
if port == "" { | |
port = "8080" | |
} | |
log.Fatal(http.ListenAndServe(":"+port, nil)) | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package cloudrun | |
import ( | |
"time" | |
"github.com/gocolly/colly" | |
) | |
type EBayKleinanzeigen struct{} | |
func (EBayKleinanzeigen) parseItem(e *colly.XMLElement) Item { | |
selector := "//*[contains(@class, 'ad-listitem')]//*[contains(@class, 'aditem-main')]//a" | |
title := e.ChildText(selector) | |
url := e.ChildAttr(selector, "href") | |
locationNodes := e.ChildTexts("//*[contains(@class, 'aditem-details')]//text()") | |
var location string | |
if len(locationNodes) > 8 { | |
location = locationNodes[6] + " " + locationNodes[8] | |
} | |
priceString := e.ChildText("//*[contains(@class, 'aditem-details')]//strong") | |
price, _ := parsePrice(priceString) | |
spaceString := e.ChildText("//*[contains(@class, 'text-module-end')]//*[contains(text(), 'm²')]") | |
livingSpace, _ := parseSpace(spaceString) | |
roomsString := e.ChildText("//*[contains(@class, 'text-module-end')]//*[contains(text(), 'Zimmer')]") | |
rooms, _ := parseFloat(roomsString, " Zimmer") | |
return Item{ | |
title: title, | |
location: location, | |
hasExactLocation: false, | |
price: price, | |
livingSpace: livingSpace, | |
rooms: rooms, | |
url: e.Request.AbsoluteURL(url), | |
scrapedAt: time.Now().UTC(), | |
} | |
} | |
func (platform EBayKleinanzeigen) NewCollector(config Config) *colly.Collector { | |
options := append( | |
config.collectorOptions, | |
colly.AllowedDomains("www.ebay-kleinanzeigen.de")) | |
return colly.NewCollector(options...) | |
} | |
func (platform EBayKleinanzeigen) crawl(config Config, exporter Exporter) *colly.Collector { | |
c := platform.NewCollector(config) | |
c.OnXML("//*[contains(@class, 'ad-listitem')]", func(e *colly.XMLElement) { | |
item := platform.parseItem(e) | |
exporter.write(item) | |
}) | |
c.OnXML("//a[contains(@class, 'pagination-next')]", func(e *colly.XMLElement) { | |
url := e.Request.AbsoluteURL(e.Attr("href")) | |
c.Visit(url) | |
}) | |
c.Visit("https://www.ebay-kleinanzeigen.de/s-wohnung-mieten/berlin/c203l3331") | |
return c | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package cloudrun | |
import ( | |
"encoding/csv" | |
"log" | |
"os" | |
"path" | |
"reflect" | |
"github.com/gocolly/colly" | |
) | |
type Exporter interface { | |
write(record Item) error | |
} | |
type CSVExporter struct { | |
writer *csv.Writer | |
fileName string | |
} | |
func (exp CSVExporter) write(record Item) error { | |
return exp.writer.Write(record.csvRow()) | |
} | |
func (CSVExporter) fields() []string { | |
val := reflect.ValueOf(&Item{}).Elem() | |
names := make([]string, val.NumField()) | |
for i := 0; i < val.NumField(); i++ { | |
names[i] = val.Type().Field(i).Name | |
} | |
return names | |
} | |
func (exp CSVExporter) run(config Config, fn func(Config, Exporter) *colly.Collector) { | |
os.MkdirAll(path.Dir(exp.fileName), 755) | |
file, err := os.Create(exp.fileName) | |
if err != nil { | |
log.Fatalf("Cannot create file %q: %s\n", exp.fileName, err) | |
return | |
} | |
defer file.Close() | |
exp.writer = csv.NewWriter(file) | |
defer exp.writer.Flush() | |
exp.writer.Write(exp.fields()) | |
collector := fn(config, exp) | |
log.Printf("Scraping finished, check file %q for results\n", exp.fileName) | |
log.Println(collector) | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package cloudrun | |
import ( | |
"log" | |
"net/url" | |
"os" | |
"path" | |
"reflect" | |
"strconv" | |
"strings" | |
"time" | |
"github.com/gocolly/colly" | |
) | |
type Platform interface { | |
crawl(config Config, writer Exporter) *colly.Collector | |
} | |
type Item struct { | |
title string | |
location string | |
hasExactLocation bool | |
price int | |
livingSpace float64 | |
rooms float64 | |
url string | |
scrapedAt time.Time | |
} | |
type Config struct { | |
dataDir string | |
platforms []Platform | |
storage Storage | |
collectorOptions []colly.CollectorOption | |
} | |
func (record Item) csvRow() []string { | |
return []string{ | |
record.title, | |
record.location, | |
strconv.FormatBool(record.hasExactLocation), | |
strconv.Itoa(record.price), | |
strconv.FormatFloat(record.livingSpace, 'f', -1, 64), | |
strconv.FormatFloat(record.rooms, 'f', -1, 64), | |
record.url, | |
record.scrapedAt.Format(time.RFC3339), | |
} | |
} | |
func readConfig(params url.Values) Config { | |
available := map[string]Platform{ | |
"ebay_kleinanzeigen": EBayKleinanzeigen{}, | |
"immobilien_scout": ImmobilienScout{}, | |
"immowelt": Immowelt{}, | |
"nestpick": Nestpick{}, | |
} | |
platforms := make([]Platform, 0) | |
for name := range available { | |
platforms = append(platforms, available[name]) | |
} | |
cache := params.Get("cache") == "1" | |
var collectorOptions []colly.CollectorOption | |
if cache { | |
collectorOptions = append(collectorOptions, colly.CacheDir("cache")) | |
} | |
platform := params.Get("platform") | |
if platform != "" { | |
platforms = []Platform{available[platform]} | |
} | |
bucket, isDefined := os.LookupEnv("GCLOUD_BUCKET") | |
if !isDefined { | |
log.Fatalln("GCLOUD_BUCKET must be defined") | |
} | |
date := time.Now().UTC().Format(time.RFC3339) | |
storage := GCloudStorage{ | |
bucket: bucket, | |
destinationPath: date + "/", | |
} | |
return Config{ | |
dataDir: "/tmp/wohnung", | |
platforms: platforms, | |
storage: storage, | |
collectorOptions: collectorOptions, | |
} | |
} | |
func Run(params url.Values) string { | |
config := readConfig(params) | |
for _, platform := range config.platforms { | |
fileName := strings.Split(reflect.TypeOf(platform).String(), ".")[1] | |
fileName = path.Join(config.dataDir, fileName+".csv") | |
exporter := CSVExporter{fileName: fileName} | |
exporter.run(config, platform.crawl) | |
config.storage.write(fileName) | |
} | |
return "it works" | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package cloudrun | |
import (
	"math"
	"strconv"
	"strings"
)
// parsePrice converts a German-formatted euro amount such as "1.234,56 €"
// into an integer number of cents. On a parse error it returns 0 and the
// error.
func parsePrice(valueStr string) (int, error) {
	value, err := parseFloat(valueStr, " €")
	if err != nil {
		return 0, err
	}
	// Round rather than truncate: 19.99*100 is 1998.9999999999998 in float64.
	return int(math.Round(value * 100)), nil
}

// parseSpace converts a living-space string such as "75,5 m²" into its
// numeric value.
func parseSpace(value string) (float64, error) {
	return parseFloat(value, " m²")
}

// parseFloat parses a German-formatted number followed by unit. Thousands
// separators (".") are dropped, the decimal comma becomes a decimal point,
// the unit is removed and surrounding whitespace is trimmed before the text
// is handed to strconv.ParseFloat.
func parseFloat(valueStr string, unit string) (float64, error) {
	replacer := strings.NewReplacer(",", ".", ".", "", unit, "")
	cleaned := strings.TrimSpace(replacer.Replace(valueStr))
	return strconv.ParseFloat(cleaned, 64)
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package cloudrun | |
import ( | |
"context" | |
"io" | |
"log" | |
"os" | |
"path" | |
"path/filepath" | |
cloud_storage "cloud.google.com/go/storage" | |
) | |
type (
	// Storage is a destination for finished files; write is given the local
	// path of the file.
	Storage interface {
		write(filePath string)
	}

	// GCloudStorage uploads files to the bucket named bucket, placing each
	// object under destinationPath.
	GCloudStorage struct {
		bucket          string
		destinationPath string
	}
)
func (storage GCloudStorage) write(source string) { | |
var r io.Reader | |
f, err := os.Open(source) | |
if err != nil { | |
log.Fatal(err) | |
} | |
defer f.Close() | |
r = f | |
ctx := context.Background() | |
if err = storage.upload(ctx, r, source); err != nil { | |
log.Fatal(err) | |
} | |
} | |
func (storage GCloudStorage) upload(ctx context.Context, r io.Reader, source string) error { | |
client, err := cloud_storage.NewClient(ctx) | |
if err != nil { | |
return err | |
} | |
bh := client.Bucket(storage.bucket) | |
name := path.Join(storage.destinationPath, filepath.Base(source)) | |
obj := bh.Object(name) | |
w := obj.NewWriter(ctx) | |
if _, err := io.Copy(w, r); err != nil { | |
return err | |
} | |
if err := w.Close(); err != nil { | |
return err | |
} | |
return nil | |
} |
interesting choice of tech used. nice work :)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Awesome work. I'm not understanding exactly what it's doing, but it looks very nice.