Last active
April 11, 2025 20:55
-
-
Save tokudu/da376453b4197789bbe37e2596da1e4d to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import { Database, Json } from '@/types/supabase-schema.generated' | |
import { createClient } from '@supabase/supabase-js' | |
import chalk from 'chalk' | |
import pino from 'pino' | |
import PinoPretty from 'pino-pretty' | |
const logger = pino({ | |
transport: { | |
target: 'pino-pretty', | |
options: { | |
colorize: true, | |
messageFormat: '{msg}', | |
ignore: 'pid,hostname', | |
} satisfies PinoPretty.PrettyOptions, | |
}, | |
level: 'debug', | |
}) | |
// eslint-disable-next-line @typescript-eslint/no-non-null-assertion | |
const supabaseUrl = process.env.SUPABASE_URL! | |
// eslint-disable-next-line @typescript-eslint/no-non-null-assertion | |
const supabaseKey = process.env.SUPABASE_SERVICE_ROLE_KEY! | |
const supabaseClient = createClient<Database>(supabaseUrl, supabaseKey) | |
// CREATE TYPE "public"."platform" AS enum('ios', 'android', 'macos'); | |
// CREATE TABLE reviews ( | |
// id UUID NOT NULL DEFAULT gen_random_uuid (), | |
// platform "public"."platform" NOT NULL, | |
// locale text NOT NULL, | |
// store_id text NOT NULL, | |
// store_url text, | |
// author_name text NOT NULL, | |
// author_image_url text, | |
// rating int NOT NULL, | |
// thumbs_up int NOT NULL, | |
// title text, | |
// text text NOT NULL, | |
// app_version text, | |
// reply_text text, | |
// raw_data jsonb NOT NULL, | |
// replied_at TIMESTAMP WITH TIME ZONE, | |
// created_at TIMESTAMP WITH TIME ZONE NOT NULL, | |
// updated_at TIMESTAMP WITH TIME ZONE NOT NULL DEFAULT now(), | |
// CONSTRAINT reviews_store_id_ukey UNIQUE (store_id) | |
// ); | |
// CREATE INDEX "reviews_platform_idx" ON "public"."reviews" USING "btree" ("platform"); | |
// CREATE INDEX "reviews_created_at_idx" ON "public"."reviews" USING "btree" ("created_at"); | |
// CREATE INDEX "reviews_updated_at_idx" ON "public"."reviews" USING "btree" ("updated_at"); | |
// CREATE INDEX "reviews_store_id_idx" ON "public"."reviews" USING "btree" ("store_id"); | |
// CREATE INDEX "reviews_app_version_idx" ON "public"."reviews" USING "btree" ("app_version"); | |
type Review = Database['public']['Tables']['reviews']['Insert'] | |
async function scrapePlayStoreReviews({ | |
packageName, | |
locale = 'en', | |
sort = 2, | |
pageSize = 20, | |
throttleMs = 2000, | |
}: { | |
packageName: string | |
locale?: string | |
country?: string | |
sort?: 2 | 3 | 1 // 2 - newest, 3 - rating, 1 - helpfulness | |
pageSize?: number | |
throttleMs?: number | |
}): Promise<Review[]> { | |
const reviews: Review[] = [] | |
let nextPageToken: string | undefined | |
while (nextPageToken !== undefined || reviews.length === 0) { | |
/* Sample CURL command: | |
curl -s 'https://play.google.com/_/PlayStoreUi/data/batchexecute?rpcids=qnKhOb&f.sid=-697906427155521722&bl=boq_playuiserver_20190903.08_p0&hl=en&gl=us&authuser&soc-app=121&soc-platform=1&soc-device=1&_reqid=1065213' -X POST -H 'content-type: application/x-www-form-urlencoded;charset=UTF-8' -H 'user-agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7)' --data-raw 'f.req=%5B%5B%5B%22UsvDTd%22%2C%22%5Bnull%2Cnull%2C%5B2%2C2%2C%5B100%2Cnull%2Cnull%5D%2Cnull%2C%5B%5D%5D%2C%5B%5C%22co.justimagine.rosebud%5C%22%2C7%5D%5D%22%2Cnull%2C%22generic%22%5D%5D%5D' --compressed | cut -c6- | grep -o '\[\[\[.*\]\]\]' | sed 's/\\"/"/g' | sed 's/\\\\/\\/g' | jq -r '.[0][] | "[\(.[2])★] \(.[1][0]): \(.[4])"' | sort -r | |
*/ | |
const url = new URL('https://play.google.com/_/PlayStoreUi/data/batchexecute') | |
url.searchParams.set('hl', locale) | |
url.searchParams.set('gl', 'us') | |
url.searchParams.set('authuser', '') | |
url.searchParams.set('rpcids', 'qnKhOb') | |
url.searchParams.set('f.sid', '-697906427155521722') | |
url.searchParams.set('bl', 'boq_playuiserver_20190903.08_p0') | |
url.searchParams.set('soc-app', '121') | |
url.searchParams.set('soc-platform', '1') | |
url.searchParams.set('soc-device', '1') | |
url.searchParams.set('_reqid', '1065213') | |
logger.info( | |
`Fetching next page of reviews (${chalk.bold(`pageSize=${pageSize}`)}) via ${chalk.bold(url.toString())} ...`, | |
) | |
const response = await fetch(url.toString(), { | |
method: 'POST', | |
headers: { | |
'content-type': 'application/x-www-form-urlencoded;charset=UTF-8', | |
'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7)', | |
}, | |
body: nextPageToken | |
? `f.req=%5B%5B%5B%22UsvDTd%22%2C%22%5Bnull%2Cnull%2C%5B2%2C${sort}%2C%5B${pageSize}%2Cnull%2C%5C%22${nextPageToken}%5C%22%5D%2Cnull%2C%5B%5D%5D%2C%5B%5C%22${packageName}%5C%22%2C7%5D%5D%22%2Cnull%2C%22generic%22%5D%5D%5D` | |
: `f.req=%5B%5B%5B%22UsvDTd%22%2C%22%5Bnull%2Cnull%2C%5B2%2C${sort}%2C%5B${pageSize}%2Cnull%2Cnull%5D%2Cnull%2C%5B%5D%5D%2C%5B%5C%22${packageName}%5C%22%2C7%5D%5D%22%2Cnull%2C%22generic%22%5D%5D%5D`, | |
}) | |
if (!response.ok) { | |
const errorText = await response.text() | |
throw new Error(`HTTP error! Status: ${response.status}, Text: ${errorText}`) | |
} | |
const responseText = await response.text() | |
// Parse the response to get the reviews and the next page token | |
const responseData = JSON.parse(JSON.parse(responseText.substring(5))[0][2] || `[]`) | |
if (responseData.length) { | |
reviews.push( | |
...responseData[0].map( | |
// eslint-disable-next-line @typescript-eslint/no-explicit-any | |
(data: any) => { | |
const review = { | |
raw_data: data, | |
locale, | |
platform: 'android', | |
store_id: data[0], | |
store_url: `https://play.google.com/store/apps/details?id=${packageName}&reviewId=${data[0]}`, | |
author_name: data[1][0], | |
author_image_url: data[1][1] ? data[1][1][3][2] : null, | |
rating: data[2], | |
title: data[3] || null, | |
text: data[4], | |
created_at: new Date(Number(`${data[5][0]}${`${data[5][1] || '000'}`.substring(0, 3)}`)).toISOString(), | |
app_version: data[10] || null, | |
thumbs_up: data[6], | |
...(data[7] | |
? { | |
reply_text: data[7][1], | |
replied_at: new Date( | |
Number(`${data[7][2][0]}${`${data[7][2][1] || '000'}`.substring(0, 3)}`), | |
).toISOString(), | |
} | |
: {}), | |
} satisfies Review | |
return review | |
}, | |
), | |
) | |
} | |
// Check if we have a next page token and need to fetch more results | |
nextPageToken = responseData[1]?.[1] // Extract the next page token from the response | |
if (nextPageToken === undefined) { | |
break | |
} | |
// Wait for the specified throttle time before making the next request | |
logger.info(`Waiting for ${chalk.bold(`${throttleMs}ms`)} before fetching next page...`) | |
await new Promise((resolve) => setTimeout(resolve, throttleMs)) | |
} | |
return reviews | |
} | |
async function scrapeAppStoreReviews({ | |
appId, | |
locale = 'us', | |
page = 1, | |
pageSize = 50, | |
throttleMs = 2000, | |
}: { | |
appId: string | |
locale?: string | |
page?: number | |
pageSize?: number | |
throttleMs?: number | |
}): Promise<Review[]> { | |
const reviews: Review[] = [] | |
let currentPage = page | |
let hasMorePages = true | |
while (hasMorePages) { | |
const url = new URL( | |
`https://itunes.apple.com/${locale}/rss/customerreviews/page=${currentPage}/id=${appId}/sortby=mostrecent/json`, | |
) | |
logger.info( | |
`Fetching page ${chalk.bold(`${currentPage}`)} of reviews (${chalk.bold(`pageSize=${pageSize}`)}) via ${chalk.bold(url.toString())} ...`, | |
) | |
const response = await fetch(url.toString(), { | |
method: 'GET', | |
headers: { | |
'User-Agent': | |
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36', | |
}, | |
}) | |
if (!response.ok) { | |
// If we get a 404, it likely means we've reached the end of available reviews | |
if (response.status === 404) { | |
hasMorePages = false | |
break | |
} | |
const errorText = await response.text() | |
throw new Error(`HTTP error! Status: ${response.status}, Text: ${errorText}`) | |
} | |
const responseData = (await response.json()) as { | |
feed?: { | |
entry?: | |
| { | |
id: { label: string } | |
'im:rating': { label: string } | |
updated?: { label: string } | |
'im:version'?: { label: string } | |
author: { | |
name: { label: string } | |
uri?: { label: string } | |
} | |
title?: { label: string } | |
content?: { label: string } | |
[key: string]: unknown | |
} | |
| { | |
id: { label: string } | |
'im:rating': { label: string } | |
updated?: { label: string } | |
'im:version'?: { label: string } | |
author: { | |
name: { label: string } | |
uri?: { label: string } | |
} | |
title?: { label: string } | |
content?: { label: string } | |
[key: string]: unknown | |
}[] | |
author?: unknown | |
updated?: unknown | |
rights?: unknown | |
title?: unknown | |
icon?: unknown | |
link?: unknown | |
id?: unknown | |
[key: string]: unknown | |
} | |
[key: string]: unknown | |
} | |
// Check if we have entries in the feed | |
const entries = responseData.feed?.entry || [] | |
// Filter out the first entry if it's the app info (not a review) | |
// Make sure entries is an array before calling filter | |
const reviewEntries = Array.isArray(entries) | |
? entries.filter((entry) => entry['im:rating']) | |
: entries && typeof entries === 'object' && 'im:rating' in entries | |
? [entries] // If it's a single object with im:rating, convert to array | |
: [] // Otherwise, empty array | |
if (reviewEntries.length === 0) { | |
hasMorePages = false | |
break | |
} | |
console.log(reviewEntries[0]) | |
reviews.push( | |
...reviewEntries.map( | |
(entry) => | |
({ | |
raw_data: JSON.parse(JSON.stringify(entry)) as Json, | |
platform: 'ios', | |
locale, | |
store_id: entry.id.label, | |
store_url: entry.id.label, | |
author_name: entry.author.name.label, | |
author_image_url: null, // App Store doesn't provide author images | |
rating: parseInt(entry['im:rating']?.label || '0', 10), | |
title: entry.title?.label || null, | |
text: entry.content?.label || '', | |
created_at: new Date(entry.updated?.label || new Date().toISOString()).toISOString(), | |
app_version: entry['im:version']?.label || null, | |
thumbs_up: 0, // App Store doesn't provide thumbs up count | |
}) satisfies Review, | |
), | |
) | |
// Increment page for next iteration | |
currentPage++ | |
// Check if we should continue fetching more pages | |
// Apple's RSS feed typically has a limit on how many pages we can fetch | |
if (currentPage > 10 || reviewEntries.length < pageSize) { | |
hasMorePages = false | |
} | |
// Wait for the specified throttle time before making the next request | |
if (hasMorePages) { | |
logger.info(`Waiting for ${chalk.bold(`${throttleMs}ms`)} before fetching next page...`) | |
await new Promise((resolve) => setTimeout(resolve, throttleMs)) | |
} | |
} | |
return reviews | |
} | |
function logReview(review: Review) { | |
const stars = `${chalk.yellow( | |
Array.from({ length: review.rating }) | |
.map(() => '★') | |
.join(''), | |
)}${Array.from({ length: 5 - review.rating }) | |
.map(() => '☆') | |
.join('')}` | |
logger.info( | |
`[${chalk.bold(review.platform)}] ${stars} ${chalk.green(review.author_name)}: ${chalk.magenta(`${review.text.substring(0, 50)}...`)}`, | |
) | |
} | |
async function run({ playStorePackageName, appStoreAppId }: { playStorePackageName: string; appStoreAppId: string }) { | |
const reviews: Review[] = [] | |
try { | |
// PLAY STORE | |
// prettier-ignore | |
const playStoreLocaleCodes = [ "af", "ak", "sq", "am", "ar", "hy", "az", "eu", "be", "bem", "bn", "bh", "xx-bork", "bs", "br", "bg", "km", "ca", "chr", "ny", "zh-cn", "zh-tw", "co", "hr", "cs", "da", "nl", "xx-elmer", "en", "eo", "et", "ee", "fo", "tl", "fi", "fr", "fy", "gaa", "gl", "ka", "de", "el", "kl", "gn", "gu", "xx-hacker", "ht", "ha", "haw", "iw", "hi", "hu", "is", "ig", "id", "ia", "ga", "it", "ja", "jw", "kn", "kk", "rw", "rn", "xx-klingon", "kg", "ko", "kri", "ku", "ckb", "ky", "lo", "la", "lv", "ln", "lt", "loz", "lg", "ach", "mk", "mg", "my", "ms", "ml", "mt", "mv", "mi", "mr", "mfe", "mo", "mn", "sr-me", "ne", "pcm", "nso", "no", "nn", "oc", "or", "om", "ps", "fa", "xx-pirate", "pl", "pt", "pt-br", "pt-pt", "pa", "qu", "ro", "rm", "nyn", "ru", "gd", "sr", "sh", "st", "tn", "crs", "sn","sd", "si", "sk", "sl", "so", "es", "es-419", "su", "sw", "sv", "tg", "ta", "tt", "te", "th", "ti", "to", "lua", "tum", "tr", "tk", "tw", "ug", "uk", "ur", "uz", "vu", "vi", "cy", "wo", "xh", "yi", "yo", "zu"] as const | |
for (const locale of playStoreLocaleCodes) { | |
logger.info(chalk.cyan(`Fetching PlayStore reviews for locale ${chalk.cyan.bold(locale)}...`)) | |
reviews.push( | |
...(await scrapePlayStoreReviews({ packageName: playStorePackageName, locale })).filter((review) => { | |
if (reviews.some((r) => r.store_id === review.store_id)) return false | |
logReview(review) | |
return true | |
}), | |
) | |
} | |
// APP STORE | |
// prettier-ignore | |
const appStoreLocaleCodes = ["ae", "ag", "ai", "al", "am", "ao", "ar", "at", "au", "az", "bb", "be", "bf", "bg", "bh", "bj", "bm", "bn", "bo", "br", "bs", "bt", "bw", "by", "bz", "ca", "cg", "ch", "cl", "cn", "co", "cr", "cv", "cy", "cz", "de", "dk", "dm", "do", "dz", "ec", "ee", "eg", "es", "fi", "fj", "fm", "fr", "gb", "gd", "gh", "gm", "gr", "gt", "gw", "gy", "hk", "hn", "hr", "hu", "id", "ie", "il", "in", "is", "it", "jm", "jo", "jp", "ke", "kg", "kh", "kn", "kr", "kw", "ky", "kz", "la", "lb", "lc", "lk", "lr", "lt", "lu", "lv", "md", "mg", "mk", "ml", "mn", "mo", "mr", "ms", "mt", "mu", "mw", "mx", "my", "mz", "na", "ne", "ng", "ni", "nl", "no", "np", "nz", "om", "pa", "pe", "pg", "ph", "pk", "pl", "pt", "pw", "py", "qa", "ro", "ru", "sa", "sb", "sc", "se", "sg", "si", "sk", "sl", "sn", "sr", "st", "sv", "sz", "tc", "td", "th", "tj", "tm", "tn", "tr", "tt", "tw", "tz", "ua", "ug", "us", "uy", "uz", "vc", "ve", "vg", "vn", "ye", "za", "zw"] as const | |
for (const locale of appStoreLocaleCodes) { | |
logger.info(chalk.cyan(`Fetching AppStore reviews for locale ${chalk.cyan.bold(locale)}...`)) | |
reviews.push( | |
...(await scrapeAppStoreReviews({ appId: appStoreAppId, locale })).filter((review) => { | |
if (reviews.some((r) => r.store_id === review.store_id)) return false | |
logReview(review) | |
return true | |
}), | |
) | |
} | |
logger.info(chalk.green(`Inserting reviews into database...`)) | |
for (const review of reviews) { | |
logReview(review) | |
} | |
const { error } = await supabaseClient.from('reviews').upsert(Object.values(reviews), { onConflict: 'store_id' }) | |
if (error) { | |
throw error | |
} | |
logger.info('') | |
logger.info(chalk.green.bold('Scrapping completed!')) | |
logger.info(`Android reviews: ${chalk.green.bold(reviews.filter((r) => r.platform === 'android').length)}`) | |
logger.info(`iOS reviews: ${chalk.green.bold(reviews.filter((r) => r.platform === 'ios').length)}`) | |
logger.info(`Total reviews: ${chalk.magenta.bold(reviews.length)}`) | |
} catch (err) { | |
logger.error(err, chalk.red.bold('Scrapping failed')) | |
process.exit(1) | |
} | |
} | |
run({ | |
playStorePackageName: 'com.pixelheart.wledplus', | |
appStoreAppId: '6474789652', | |
}) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment