Last active
April 18, 2022 21:14
-
-
Save soypat/3728d463adeafe282dc9ef757cc59f2f to your computer and use it in GitHub Desktop.
Get SYC component data using a web scraper
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package main | |
import ( | |
"fmt" | |
"log" | |
"strconv" | |
"strings" | |
wd "github.com/fedesog/webdriver" | |
) | |
// Hosts of the shop pages this scraper knows how to read. The host of the
// page currently open in the browser is compared against these values.
const (
	// mlHost is the host serving MercadoLibre article pages.
	mlHost = "articulo.mercadolibre.com.ar"

	// sycHost is the host serving SYC product pages.
	sycHost = "www.sycelectronica.com.ar"
)
// article is the data scraped from a single product page.
type article struct {
	// Title is the product's display name as shown on the page.
	Title string
	// Category is the category text shown on the page.
	Category string
	// SKU identifies the product; an empty SKU means the scrape failed.
	SKU string

	// USD is the unit price in US dollars.
	USD float64
	// TaxPercent is the tax as a percent, i.e. 21.5 means 21.5%.
	TaxPercent float64

	// Availability is the stock text shown on the page.
	Availability string
	// Image is the source URL of the product image, when one was found.
	Image string
	// URL is the address of the page the article was scraped from.
	URL string
}
func (a article) excelify(quantity int) string { | |
return fmt.Sprintf("%d\t%s\t%s\t%s\t%s\t%s\t%s\t%s", | |
quantity, a.Title, a.URL, a.SKU, a.usdString(), a.ivaString(), a.Category, a.Availability) | |
} | |
func (a article) usdString() string { | |
return strings.Replace(fmt.Sprintf("%.3f", a.USD), ".", Decimal, 1) | |
} | |
func (a article) ivaString() string { | |
return strings.Replace(fmt.Sprintf("%.1f", a.TaxPercent), ".", Decimal, 1) | |
} | |
func getMLArticle(s *wd.Session) (a article) { | |
pesoPrice := queryText(s, `span.andes-money-amount__fraction`) | |
pesos, err := strconv.ParseFloat(strings.ReplaceAll(pesoPrice, ",", "."), 64) | |
if err != nil { | |
log.Println("could not parse ML price: ", pesoPrice) | |
return a | |
} | |
a.USD = pesos / USD2Pesos | |
a.Title = queryText(s, `h1.ui-pdp-title`) | |
a.Availability = queryText(s, `span.ui-pdp-buybox__quantity__available`) | |
a.Category = queryText(s, `li > a.andes-breadcrumb__link`) | |
a.SKU = getMLSKU(s) | |
a.URL, _ = s.GetUrl() | |
return a | |
} | |
func getMLSKU(s *wd.Session) string { | |
el, err := query(s, `div.ui-pdp-bookmark__link-bookmark > form`) | |
if err != nil { | |
log.Println("unable to find MLSKU element") | |
return "" | |
} | |
v, err := el.GetAttribute("action") | |
if err != nil { | |
log.Println("could not find attribute for MLSKU element") | |
return "" | |
} | |
a := strings.Split(v, "/") | |
if len(a) < 4 { | |
log.Println("MLSKU attribute could not be parsed") | |
return "" | |
} | |
return a[3] | |
} | |
func getSYCArticle(s *wd.Session) (a article) { | |
_, err := query(s, `div.product_description`) | |
if err != nil { | |
return a | |
} | |
a.Category = queryText(s, `div.product_category`) | |
a.SKU = queryText(s, `div.product_name`) | |
a.Title = queryText(s, `div.product_text`) | |
a.Availability = queryText(s, `div[property=availability] > div`) | |
price := queryText(s, `div.product_price`) | |
nums := ParseNumbers(strings.ReplaceAll(price, ",", ".")) | |
if len(nums) < 2 || nums[0] <= 0 { | |
log.Printf("got bad price line %q. nums:%g\n", price, nums) | |
} else { | |
a.USD = nums[0] | |
a.TaxPercent = nums[1] | |
} | |
img, err := query(s, `div.image_selected > a > img`) | |
if err == nil { | |
a.Image, _ = img.GetAttribute("src") | |
} | |
a.URL, _ = s.GetUrl() | |
return a | |
} | |
// ParseNumbers reads all numbers from a string and returns
// them in the order they are found as floats.
//
// A number is an optional leading '-', then digits with an optional
// fractional part ("12", "12.5", ".5", "12."), then an optional exponent
// ("e3", "E-2"). The longest literal is taken at each position, so
// characters that merely look numeric but do not complete a literal (a
// dangling 'e', a stray '+' or '-') end the number instead of spoiling it:
// "1.5+IVA" yields 1.5. A '-' directly in front of a number is always read as
// its sign, so "10-20" yields 10 and -20. Literals that strconv.ParseFloat
// rejects (for example out-of-range values such as "1e999") are skipped.
// The result is nil when s holds no numbers.
func ParseNumbers(s string) (nums []float64) {
	for i := 0; i < len(s); {
		end := scanNumber(s, i)
		if end == i {
			// No literal starts at this byte. All literal characters are
			// ASCII, so stepping bytewise over multi-byte runes is safe.
			i++
			continue
		}
		if num, err := strconv.ParseFloat(s[i:end], 64); err == nil {
			nums = append(nums, num)
		}
		i = end
	}
	return nums
}

// scanNumber returns the index just past the longest decimal float literal
// beginning at s[start], or start itself when no literal begins there.
func scanNumber(s string, start int) int {
	isDigit := func(i int) bool { return i < len(s) && s[i] >= '0' && s[i] <= '9' }

	i := start
	if i < len(s) && s[i] == '-' {
		i++
	}

	// Mantissa: integer digits, optionally followed by '.' and more digits.
	// At least one digit is required on either side of the point.
	intStart := i
	for isDigit(i) {
		i++
	}
	digits := i - intStart
	if i < len(s) && s[i] == '.' {
		j := i + 1
		for isDigit(j) {
			j++
		}
		digits += j - (i + 1)
		if digits > 0 {
			i = j
		}
	}
	if digits == 0 {
		return start
	}

	// Exponent: only consumed when at least one digit follows it, so that
	// text such as "5e" or "5 e+" still yields 5.
	if i < len(s) && (s[i] == 'e' || s[i] == 'E') {
		j := i + 1
		if j < len(s) && (s[j] == '+' || s[j] == '-') {
			j++
		}
		if isDigit(j) {
			for isDigit(j) {
				j++
			}
			i = j
		}
	}
	return i
}
// isNumRune reports whether r is an ASCII decimal digit ('0' through '9').
// An explicit range check is used so that negative rune values are rejected.
func isNumRune(r rune) bool { return '0' <= r && r <= '9' }
func isFloatRune(r rune) bool { | |
return isNumRune(r) || r == '.' || r == 'E' || r == '+' || r == '-' || r == 'e' | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Install chromedriver at https://chromedriver.chromium.org/home | |
package main | |
import ( | |
"bufio" | |
"log" | |
"net/url" | |
"os" | |
"strconv" | |
"strings" | |
wd "github.com/fedesog/webdriver" | |
) | |
// Program configuration.
const (
	// platform is sent as the "Platform" capability when the browser
	// session is created.
	platform = "Linux"

	// urlStart is the page the browser opens on startup.
	urlStart = "https://www.sycelectronica.com.ar/"

	// driverPath is the location of the chromedriver executable.
	// chromedriver required. Get it at https://chromedriver.chromium.org/home
	driverPath = "/home/pato/local/bin/chromedriver"

	// Decimal replaces the '.' decimal point in formatted numbers.
	Decimal = ","

	// USD2Pesos is the exchange rate used to convert peso prices to USD
	// (pesos per dollar).
	USD2Pesos = 200.0
)
func main() { | |
driver, err := openBrowser() | |
if err != nil { | |
log.Fatal(err) | |
} | |
fp, err := os.OpenFile("articles.tsv", os.O_APPEND|os.O_WRONLY|os.O_CREATE, 0644) | |
if err != nil { | |
log.Fatal(err) | |
} | |
defer fp.Close() | |
var quantity int | |
scanner := bufio.NewScanner(os.Stdin) | |
log.Println("Navigate to article and enter quantity of articles desired:") | |
for scanner.Scan() { | |
txt := scanner.Text() | |
quantity, err = strconv.Atoi(txt) | |
if err != nil { | |
if err := scanner.Err(); err != nil { | |
log.Fatal(err) | |
} else { | |
log.Println("invalid input. must be integer") | |
} | |
continue | |
} | |
var art article | |
host := getHost(driver) | |
switch host { | |
case sycHost: | |
art = getSYCArticle(driver) | |
case mlHost: | |
art = getMLArticle(driver) | |
default: | |
log.Println("unknown host or got error at ", host) | |
continue | |
} | |
if art.SKU == "" { | |
log.Println("could not get article SKU. Are you on a article page?") | |
continue | |
} | |
log.Println("writing", quantity, art.Title) | |
_, err = fp.WriteString(art.excelify(quantity) + "\n") | |
if err != nil { | |
log.Fatal(err) | |
} | |
} | |
} | |
func openBrowser() (*wd.Session, error) { | |
//driverPath, _ := studentCmd.PersistentFlags().GetString("driver") | |
chromeDriver := wd.NewChromeDriver(driverPath) | |
err := chromeDriver.Start() | |
if err != nil { | |
return &wd.Session{}, err | |
} | |
var session *wd.Session | |
desired := wd.Capabilities{"Platform": platform} | |
required := wd.Capabilities{"Platform": platform} | |
session, err = chromeDriver.NewSession(desired, required) | |
if err != nil { | |
return session, err | |
} | |
err = session.Url(urlStart) | |
if err != nil { | |
log.Fatal(err) | |
} | |
return session, nil | |
} | |
func queryText(s *wd.Session, querySelector string) string { | |
el, err := query(s, querySelector) | |
if err != nil { | |
log.Println("queryText failed:", err) | |
return "" | |
} | |
str, _ := el.Text() | |
return strings.ReplaceAll(str, "\t", "") // eliminate tabular separators | |
} | |
func querys(s *wd.Session, querySelector string) ([]wd.WebElement, error) { | |
return s.FindElements(wd.FindElementStrategy("css selector"), querySelector) | |
} | |
func query(s *wd.Session, querySelector string) (wd.WebElement, error) { | |
return s.FindElement(wd.FindElementStrategy("css selector"), querySelector) | |
} | |
func attrQuery(s *wd.Session, attrName, querySelector string) string { | |
e, err := query(s, querySelector) | |
if err != nil { | |
return "" | |
} | |
attribute, err := e.GetAttribute(attrName) | |
if err != nil { | |
return "" | |
} | |
return attribute | |
} | |
func mouseClickSelector(s *wd.Session, querySelector string) (*wd.Session, error) { | |
var m wd.MouseButton | |
elem, err := query(s, querySelector) // button selector | |
if err != nil { | |
return s, err | |
} | |
err = s.MoveTo(elem, 0, 0) | |
if err != nil { | |
return s, err | |
} | |
return s, s.Click(m) | |
} | |
func getHost(s *wd.Session) string { | |
a, _ := s.GetUrl() | |
URL, err := url.Parse(a) | |
if err != nil { | |
return "" | |
} | |
return URL.Host | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment