package main

import (
	"log/slog"
	"sync"

	"github.com/PuerkitoBio/goquery"
	"github.com/geziyor/geziyor"
	"github.com/geziyor/geziyor/client"
	"github.com/geziyor/geziyor/export"
)
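// Dependencies (a sketch of the setup; exact versions are not pinned in this gist):
//
//	go get github.com/geziyor/geziyor
//	go get github.com/PuerkitoBio/goquery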
const (
	URLQuotes     = "quotes"
	URLTurtle     = "turtle"
	URLAliExpress = "aliexpress"
)
// AutherNText represents a quote with its author and text.
type AutherNText struct {
	Author string `json:"author"`
	Text   string `json:"text"`
}
// AliExpressProduct represents a product from AliExpress with its name and price.
type AliExpressProduct struct {
	Name  string `json:"name"`
	Price string `json:"price"`
}
// Turtle represents a turtle species with its name, description, and image URL.
type Turtle struct {
	Name        string `json:"name"`
	Description string `json:"description"`
	Image       string `json:"image"`
}
// DataExporter is a generic struct that can export data of any type.
type DataExporter[T any] struct {
	Data []T
}
// Export receives scraped items on the channel provided by geziyor and
// appends them to the exporter's Data slice. It blocks until the channel
// is closed, which happens once scraping finishes.
func (de *DataExporter[T]) Export(scraped chan any) error {
	for pd := range scraped {
		d, ok := pd.(T)
		if !ok {
			// Skip items that are not of the expected type instead of panicking.
			continue
		}
		de.Data = append(de.Data, d)
	}
	return nil
}
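// Compile-time assertion (added here for clarity) that DataExporter
// implements geziyor's export.Exporter interface, whose single method
// receives the exports channel and returns an error.
var _ export.Exporter = (*DataExporter[AutherNText])(nil)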
func main() {
	var (
		exporter = DataExporter[AutherNText]{}
		urls     = map[string][]string{
			URLQuotes: {
				"http://quotes.toscrape.com/",
			},
			URLTurtle: {
				"https://www.scrapethissite.com/pages/frames/?frame=i&family=Cheloniidae",
				"https://www.scrapethissite.com/pages/frames/?frame=i&family=Chelydridae",
				"https://www.scrapethissite.com/pages/frames/?frame=i&family=Carettochelyidae",
			},
			URLAliExpress: {
				"https://www.aliexpress.com/item/1005006959851087.html",
				"https://www.aliexpress.com/item/1005007265735821.html",
			},
		}
	)
	// Scrape quotes: follow pagination and collect every quote on the site.
	slog.Info("Processing quotes")
	g := geziyor.NewGeziyor(&geziyor.Options{
		StartURLs: urls[URLQuotes],
		ParseFunc: quotesParse,
		// Exporters: []export.Exporter{&export.JSON{}, &export.CSV{}},
		Exporters: []export.Exporter{&exporter},
	})
	g.Start()
	// slog.Info("exporter content", "data", exporter)
	// Scrape turtles: the three family pages are fetched concurrently by geziyor.
	slog.Info("Processing turtles concurrently")
	turtleExporter := DataExporter[Turtle]{}
	g = geziyor.NewGeziyor(&geziyor.Options{
		StartURLs: urls[URLTurtle],
		ParseFunc: turtleScraper,
		Exporters: []export.Exporter{&turtleExporter},
	})
	g.Start()
	slog.Info("turtle exporter content", "data", turtleExporter.Data)
	// Scrape AliExpress products: requests are issued manually in goroutines
	// and rendered through a headless browser, since the pages are dynamic.
	slog.Info("Processing aliexpress concurrently but manually")
	aliExport := DataExporter[AliExpressProduct]{}
	g = geziyor.NewGeziyor(&geziyor.Options{
		StartRequestsFunc: func(g *geziyor.Geziyor) {
			var wg sync.WaitGroup
			for _, url := range urls[URLAliExpress] {
				wg.Add(1)
				// Pass url as an argument so each goroutine gets its own copy
				// (required for correctness on Go versions before 1.22).
				go func(url string) {
					defer wg.Done()
					slog.Info("processing product", "url", url)
					g.GetRendered(url, aliexpressProduct)
				}(url)
			}
			wg.Wait()
		},
		Exporters:       []export.Exporter{&aliExport},
		BrowserEndpoint: "ws://127.0.0.1:9222",
	})
	g.Start()
	slog.Info("aliexpress exporter content", "data", aliExport.Data)
}
// quotesParse is a parsing function for quote pages.
// It extracts author and text information from each quote on the page,
// exports the data as AutherNText structs, and follows pagination links.
func quotesParse(g *geziyor.Geziyor, r *client.Response) {
	// Find all quote elements and process each one.
	r.HTMLDoc.Find("div.quote").Each(func(i int, s *goquery.Selection) {
		// Extract author and text from the quote.
		g.Exports <- AutherNText{
			Author: s.Find("small.author").Text(),
			Text:   s.Find("span.text").Text(),
		}
	})
	// Check for a "next" pagination link.
	if href, ok := r.HTMLDoc.Find("li.next > a").Attr("href"); ok {
		// If found, follow the link and continue parsing.
		g.Get(r.JoinURL(href), quotesParse)
	}
}
// turtleScraper is a parsing function for turtle family pages.
// It extracts the turtle's name, description, and image URL from the HTML
// and exports the data as a Turtle struct.
func turtleScraper(g *geziyor.Geziyor, r *client.Response) {
	g.Exports <- Turtle{
		Name:        r.HTMLDoc.Find(".turtle-family-detail > h3").Text(),
		Description: r.HTMLDoc.Find(".turtle-family-detail > p").Text(),
		Image:       r.HTMLDoc.Find(".turtle-family-detail > img").AttrOr("src", ""),
	}
}
// aliexpressProduct is a parsing function for AliExpress product pages.
// It extracts the product title and price from the rendered HTML and exports
// the data as an AliExpressProduct struct.
// Note: This function relies on JavaScript rendering to access dynamic content.
func aliexpressProduct(g *geziyor.Geziyor, r *client.Response) {
	// Extract the product title from the h1 element with data-pl="product-title".
	title := r.HTMLDoc.Find(`h1[data-pl="product-title"]`).Text()
	// Extract the product price from the span element with class "product-price-value".
	price := r.HTMLDoc.Find("span.product-price-value").Text()
	// Export the extracted data as an AliExpressProduct struct.
	g.Exports <- AliExpressProduct{Name: title, Price: price}
}
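// Running note (an assumption about the local setup, not part of the gist):
// GetRendered needs a Chrome DevTools endpoint listening at BrowserEndpoint.
// One way to provide one on port 9222 is chromedp's headless-shell image:
//
//	docker run --rm -p 9222:9222 chromedp/headless-shell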