Last active
July 7, 2017 19:59
-
-
Save beporter/49410eee41c926b90d19313871851595 to your computer and use it in GitHub Desktop.
(Crudely) collect Amazon ASINs from URLs
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php | |
/** | |
* Quick script to extract ASINs from Amazon URLs. | |
* | |
* @see http://stackoverflow.com/a/12827734/70876 | |
* | |
* beporter at users dot sourceforge dot net | |
* 2016-05-18 | |
*/ | |
//-------------------------------------- | |
/**
 * Returns every path segment of a URL that looks like an Amazon ASIN.
 *
 * A segment qualifies when it is exactly ten characters long and is either
 * `B` + two digits + seven word characters, or nine digits followed by a
 * digit or `X` (ISBN-10 style).
 *
 * @param mixed $i A URL or bare path. Surrounding whitespace is ignored;
 *                 non-scalar values (null, arrays, objects) yield no matches.
 * @return array Matching segments in left-to-right order. Keys are the
 *               segment's index within the '/'-split path (array_filter()
 *               preserves keys), so the result is not necessarily a list.
 */
function findAsins($i) {
    if (!is_scalar($i)) {
        return [];
    }

    // Trim first: stray whitespace or a trailing newline would otherwise become
    // part of the last segment and make it fail the anchored pattern below.
    $path = parse_url(trim((string) $i), PHP_URL_PATH);

    // parse_url() gives null when the URL has no path and false when the URL
    // is seriously malformed; neither may be handed to explode().
    if (!is_string($path) || $path === '') {
        return [];
    }

    return array_filter(
        explode('/', $path),
        function ($segment) {
            return preg_match('/^(B\d{2}\w{7}|\d{9}(X|\d))$/', $segment) === 1;
        }
    );
}
//-------------------------------------- | |
// main() | |
// Demo driver: run findAsins() over a handful of sample URLs/paths and print
// one line per input containing the ASIN that was found.
$inputs = [
    'http://www.amazon.com/gp/product/B00H00OG7G/ref=s9_simh_gw_g23_i2_r?ie=UTF8&fpl=fresh&pf_rd_m=ATVPDKIKX0DER&pf_rd_s=&pf_rd_r=1T4A8079XNP9A8ZH73RT&pf_rd_t=36701&pf_rd_p=5d23eaf6-6278-49c1-b6df-7de0cb9b3a26&pf_rd_i=desktop',
    '/Rust-Oleum-260357-Touch-Polyurethane-Matte/dp/B00714ZO22/more-shit/so-much/shit/',
    'Touch-Polyurethane-Matte/dp/B00714ZO22',
    'www.amazon.cn/电脑-it-办公/dp/B00YARCGHO/',
    '/B00YCCCCCC/ALSO10NUMS/?something-else',
    'Polyurethane/dp/B00RZYBZ9Y?foo=bar',
];

foreach ($inputs as $i) {
    // array_pop() takes its argument by reference, so the matches must live in
    // a real variable. Wrapping the call in extra parentheses only hid the
    // "Only variables should be passed by reference" notice on PHP 5.
    $asins = findAsins($i);

    // Assume the *last* of multiple matches in the URL is the correct one.
    // When nothing matched, array_pop() returns null and an empty line is printed.
    echo array_pop($asins) . PHP_EOL;
}
/*-------------------------------------- | |
Output: | |
$ php asins.php | |
B00H00OG7G | |
B00714ZO22 | |
B00714ZO22 | |
B00YARCGHO | |
B00YCCCCCC | |
B00RZYBZ9Y | |
*/ |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env php | |
<?php | |
/** | |
* This script takes two arguments: A path to a CSV file and a column name | |
* from which to extract ASINs. It will return a quoted, comma separated | |
* string of all found ASINs, suitable for use in an SQL query's `IN (...)` | |
* clause. | |
* | |
* Example: | |
* $ ./asin-extract.php a_file.csv columnName | |
*/ | |
/**
 * Returns an array of all path-separated ASINs matched in a provided URL.
 *
 * A path segment counts as an ASIN when it is exactly ten characters long and
 * is either `B` + two digits + seven word characters, or nine digits followed
 * by a digit or `X` (ISBN-10 style).
 *
 * @param mixed $s A URL possibly containing one or more `/ASIN/` path segments.
 *                 Surrounding whitespace is ignored; non-scalar values (such as
 *                 the null produced by an empty CSV cell) yield no matches.
 * @return array All matched ASINs, in the order they were found in the URL
 *               (left to right). Keys are the segment's index within the
 *               '/'-split path, because array_filter() preserves keys.
 */
function findAsins($s) {
    if (!is_scalar($s)) {
        return [];
    }

    // Trim first: stray whitespace or a trailing newline would otherwise become
    // part of the last segment and make it fail the anchored pattern below.
    $path = parse_url(trim((string) $s), PHP_URL_PATH);

    // parse_url() gives null when the URL has no path and false when the URL
    // is seriously malformed; neither may be handed to explode().
    if (!is_string($path) || $path === '') {
        return [];
    }

    return array_filter(
        explode('/', $path),
        function ($segment) {
            return preg_match('/^(B\d{2}\w{7}|\d{9}(X|\d))$/', $segment) === 1;
        }
    );
}
/** | |
* main() | |
*/ | |
// Command-line entry point: validate arguments, locate the requested column in
// the CSV header row, collect ASINs from that column in every data row, and
// print them as quoted, comma separated values (eight per line).
//
// Diagnostics go to STDERR with a non-zero exit status so they can never be
// mistaken for (or pasted into a query as) the ASIN list on STDOUT.
if ($argc < 3) {
    fwrite(STDERR, 'Must supply CSV path as first argument and column name containing URLs with ASINs as second argument.' . PHP_EOL);
    exit(1);
}

$path = $argv[1];
$column = $argv[2];

if (!is_readable($path)) {
    fwrite(STDERR, 'Supplied CSV path is not readable.' . PHP_EOL);
    exit(1);
}

// Lets PHP recognise CR-only line endings.
// NOTE(review): this INI setting is deprecated as of PHP 8.1 — confirm whether
// CR-only input files still need to be supported before removing it.
ini_set('auto_detect_line_endings', true);

$csv = new \SplFileObject($path);
$csv->setFlags(\SplFileObject::READ_CSV);

// The first row is expected to hold the column names. An empty file may not
// produce an array here, so guard before searching it. Strict comparison keeps
// look-alike values (e.g. "1e3" vs "1000") from matching the wrong column.
$headers = $csv->current();
$columnPosition = is_array($headers) ? array_search($column, $headers, true) : false;

if ($columnPosition === false) {
    fwrite(STDERR, sprintf('First row of CSV file does not contain a column named `%s`.', $column) . PHP_EOL);
    exit(1);
}

$foundAsins = [];
$isHeaderRow = true;

foreach ($csv as $row) {
    // foreach rewinds the file, so the first row seen here is the header again.
    if ($isHeaderRow) {
        $isHeaderRow = false;
        continue;
    }

    // Blank or short lines do not have the wanted column; skip them instead of
    // reading an undefined offset.
    if (!is_array($row) || !isset($row[$columnPosition])) {
        continue;
    }

    // Append in place rather than re-merging the whole list on every row.
    foreach (findAsins($row[$columnPosition]) as $asin) {
        $foundAsins[] = $asin;
    }
}

// Eight quoted ASINs per output line. Lines are joined with a comma so the
// complete output stays one valid comma separated list for an `IN (...)` clause.
$lines = [];
foreach (array_chunk($foundAsins, 8) as $chunk) {
    $lines[] = '"' . implode('","', $chunk) . '"';
}

if ($lines !== []) {
    echo implode(',' . PHP_EOL, $lines) . PHP_EOL;
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment