Skip to content

Instantly share code, notes, and snippets.

@tokudu
Last active April 11, 2025 20:55
Show Gist options
  • Save tokudu/da376453b4197789bbe37e2596da1e4d to your computer and use it in GitHub Desktop.
Save tokudu/da376453b4197789bbe37e2596da1e4d to your computer and use it in GitHub Desktop.
import { Database, Json } from '@/types/supabase-schema.generated'
import { createClient } from '@supabase/supabase-js'
import chalk from 'chalk'
import pino from 'pino'
import PinoPretty from 'pino-pretty'
const logger = pino({
transport: {
target: 'pino-pretty',
options: {
colorize: true,
messageFormat: '{msg}',
ignore: 'pid,hostname',
} satisfies PinoPretty.PrettyOptions,
},
level: 'debug',
})
// eslint-disable-next-line @typescript-eslint/no-non-null-assertion
const supabaseUrl = process.env.SUPABASE_URL!
// eslint-disable-next-line @typescript-eslint/no-non-null-assertion
const supabaseKey = process.env.SUPABASE_SERVICE_ROLE_KEY!
const supabaseClient = createClient<Database>(supabaseUrl, supabaseKey)
// CREATE TYPE "public"."platform" AS enum('ios', 'android', 'macos');
// CREATE TABLE reviews (
// id UUID NOT NULL DEFAULT gen_random_uuid (),
// platform "public"."platform" NOT NULL,
// locale text NOT NULL,
// store_id text NOT NULL,
// store_url text,
// author_name text NOT NULL,
// author_image_url text,
// rating int NOT NULL,
// thumbs_up int NOT NULL,
// title text,
// text text NOT NULL,
// app_version text,
// reply_text text,
// raw_data jsonb NOT NULL,
// replied_at TIMESTAMP WITH TIME ZONE,
// created_at TIMESTAMP WITH TIME ZONE NOT NULL,
// updated_at TIMESTAMP WITH TIME ZONE NOT NULL DEFAULT now(),
// CONSTRAINT reviews_store_id_ukey UNIQUE (store_id)
// );
// CREATE INDEX "reviews_platform_idx" ON "public"."reviews" USING "btree" ("platform");
// CREATE INDEX "reviews_created_at_idx" ON "public"."reviews" USING "btree" ("created_at");
// CREATE INDEX "reviews_updated_at_idx" ON "public"."reviews" USING "btree" ("updated_at");
// CREATE INDEX "reviews_store_id_idx" ON "public"."reviews" USING "btree" ("store_id");
// CREATE INDEX "reviews_app_version_idx" ON "public"."reviews" USING "btree" ("app_version");
type Review = Database['public']['Tables']['reviews']['Insert']
async function scrapePlayStoreReviews({
packageName,
locale = 'en',
sort = 2,
pageSize = 20,
throttleMs = 2000,
}: {
packageName: string
locale?: string
country?: string
sort?: 2 | 3 | 1 // 2 - newest, 3 - rating, 1 - helpfulness
pageSize?: number
throttleMs?: number
}): Promise<Review[]> {
const reviews: Review[] = []
let nextPageToken: string | undefined
while (nextPageToken !== undefined || reviews.length === 0) {
/* Sample CURL command:
curl -s 'https://play.google.com/_/PlayStoreUi/data/batchexecute?rpcids=qnKhOb&f.sid=-697906427155521722&bl=boq_playuiserver_20190903.08_p0&hl=en&gl=us&authuser&soc-app=121&soc-platform=1&soc-device=1&_reqid=1065213' -X POST -H 'content-type: application/x-www-form-urlencoded;charset=UTF-8' -H 'user-agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7)' --data-raw 'f.req=%5B%5B%5B%22UsvDTd%22%2C%22%5Bnull%2Cnull%2C%5B2%2C2%2C%5B100%2Cnull%2Cnull%5D%2Cnull%2C%5B%5D%5D%2C%5B%5C%22co.justimagine.rosebud%5C%22%2C7%5D%5D%22%2Cnull%2C%22generic%22%5D%5D%5D' --compressed | cut -c6- | grep -o '\[\[\[.*\]\]\]' | sed 's/\\"/"/g' | sed 's/\\\\/\\/g' | jq -r '.[0][] | "[\(.[2])★] \(.[1][0]): \(.[4])"' | sort -r
*/
const url = new URL('https://play.google.com/_/PlayStoreUi/data/batchexecute')
url.searchParams.set('hl', locale)
url.searchParams.set('gl', 'us')
url.searchParams.set('authuser', '')
url.searchParams.set('rpcids', 'qnKhOb')
url.searchParams.set('f.sid', '-697906427155521722')
url.searchParams.set('bl', 'boq_playuiserver_20190903.08_p0')
url.searchParams.set('soc-app', '121')
url.searchParams.set('soc-platform', '1')
url.searchParams.set('soc-device', '1')
url.searchParams.set('_reqid', '1065213')
logger.info(
`Fetching next page of reviews (${chalk.bold(`pageSize=${pageSize}`)}) via ${chalk.bold(url.toString())} ...`,
)
const response = await fetch(url.toString(), {
method: 'POST',
headers: {
'content-type': 'application/x-www-form-urlencoded;charset=UTF-8',
'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7)',
},
body: nextPageToken
? `f.req=%5B%5B%5B%22UsvDTd%22%2C%22%5Bnull%2Cnull%2C%5B2%2C${sort}%2C%5B${pageSize}%2Cnull%2C%5C%22${nextPageToken}%5C%22%5D%2Cnull%2C%5B%5D%5D%2C%5B%5C%22${packageName}%5C%22%2C7%5D%5D%22%2Cnull%2C%22generic%22%5D%5D%5D`
: `f.req=%5B%5B%5B%22UsvDTd%22%2C%22%5Bnull%2Cnull%2C%5B2%2C${sort}%2C%5B${pageSize}%2Cnull%2Cnull%5D%2Cnull%2C%5B%5D%5D%2C%5B%5C%22${packageName}%5C%22%2C7%5D%5D%22%2Cnull%2C%22generic%22%5D%5D%5D`,
})
if (!response.ok) {
const errorText = await response.text()
throw new Error(`HTTP error! Status: ${response.status}, Text: ${errorText}`)
}
const responseText = await response.text()
// Parse the response to get the reviews and the next page token
const responseData = JSON.parse(JSON.parse(responseText.substring(5))[0][2] || `[]`)
if (responseData.length) {
reviews.push(
...responseData[0].map(
// eslint-disable-next-line @typescript-eslint/no-explicit-any
(data: any) => {
const review = {
raw_data: data,
locale,
platform: 'android',
store_id: data[0],
store_url: `https://play.google.com/store/apps/details?id=${packageName}&reviewId=${data[0]}`,
author_name: data[1][0],
author_image_url: data[1][1] ? data[1][1][3][2] : null,
rating: data[2],
title: data[3] || null,
text: data[4],
created_at: new Date(Number(`${data[5][0]}${`${data[5][1] || '000'}`.substring(0, 3)}`)).toISOString(),
app_version: data[10] || null,
thumbs_up: data[6],
...(data[7]
? {
reply_text: data[7][1],
replied_at: new Date(
Number(`${data[7][2][0]}${`${data[7][2][1] || '000'}`.substring(0, 3)}`),
).toISOString(),
}
: {}),
} satisfies Review
return review
},
),
)
}
// Check if we have a next page token and need to fetch more results
nextPageToken = responseData[1]?.[1] // Extract the next page token from the response
if (nextPageToken === undefined) {
break
}
// Wait for the specified throttle time before making the next request
logger.info(`Waiting for ${chalk.bold(`${throttleMs}ms`)} before fetching next page...`)
await new Promise((resolve) => setTimeout(resolve, throttleMs))
}
return reviews
}
async function scrapeAppStoreReviews({
appId,
locale = 'us',
page = 1,
pageSize = 50,
throttleMs = 2000,
}: {
appId: string
locale?: string
page?: number
pageSize?: number
throttleMs?: number
}): Promise<Review[]> {
const reviews: Review[] = []
let currentPage = page
let hasMorePages = true
while (hasMorePages) {
const url = new URL(
`https://itunes.apple.com/${locale}/rss/customerreviews/page=${currentPage}/id=${appId}/sortby=mostrecent/json`,
)
logger.info(
`Fetching page ${chalk.bold(`${currentPage}`)} of reviews (${chalk.bold(`pageSize=${pageSize}`)}) via ${chalk.bold(url.toString())} ...`,
)
const response = await fetch(url.toString(), {
method: 'GET',
headers: {
'User-Agent':
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36',
},
})
if (!response.ok) {
// If we get a 404, it likely means we've reached the end of available reviews
if (response.status === 404) {
hasMorePages = false
break
}
const errorText = await response.text()
throw new Error(`HTTP error! Status: ${response.status}, Text: ${errorText}`)
}
const responseData = (await response.json()) as {
feed?: {
entry?:
| {
id: { label: string }
'im:rating': { label: string }
updated?: { label: string }
'im:version'?: { label: string }
author: {
name: { label: string }
uri?: { label: string }
}
title?: { label: string }
content?: { label: string }
[key: string]: unknown
}
| {
id: { label: string }
'im:rating': { label: string }
updated?: { label: string }
'im:version'?: { label: string }
author: {
name: { label: string }
uri?: { label: string }
}
title?: { label: string }
content?: { label: string }
[key: string]: unknown
}[]
author?: unknown
updated?: unknown
rights?: unknown
title?: unknown
icon?: unknown
link?: unknown
id?: unknown
[key: string]: unknown
}
[key: string]: unknown
}
// Check if we have entries in the feed
const entries = responseData.feed?.entry || []
// Filter out the first entry if it's the app info (not a review)
// Make sure entries is an array before calling filter
const reviewEntries = Array.isArray(entries)
? entries.filter((entry) => entry['im:rating'])
: entries && typeof entries === 'object' && 'im:rating' in entries
? [entries] // If it's a single object with im:rating, convert to array
: [] // Otherwise, empty array
if (reviewEntries.length === 0) {
hasMorePages = false
break
}
console.log(reviewEntries[0])
reviews.push(
...reviewEntries.map(
(entry) =>
({
raw_data: JSON.parse(JSON.stringify(entry)) as Json,
platform: 'ios',
locale,
store_id: entry.id.label,
store_url: entry.id.label,
author_name: entry.author.name.label,
author_image_url: null, // App Store doesn't provide author images
rating: parseInt(entry['im:rating']?.label || '0', 10),
title: entry.title?.label || null,
text: entry.content?.label || '',
created_at: new Date(entry.updated?.label || new Date().toISOString()).toISOString(),
app_version: entry['im:version']?.label || null,
thumbs_up: 0, // App Store doesn't provide thumbs up count
}) satisfies Review,
),
)
// Increment page for next iteration
currentPage++
// Check if we should continue fetching more pages
// Apple's RSS feed typically has a limit on how many pages we can fetch
if (currentPage > 10 || reviewEntries.length < pageSize) {
hasMorePages = false
}
// Wait for the specified throttle time before making the next request
if (hasMorePages) {
logger.info(`Waiting for ${chalk.bold(`${throttleMs}ms`)} before fetching next page...`)
await new Promise((resolve) => setTimeout(resolve, throttleMs))
}
}
return reviews
}
function logReview(review: Review) {
const stars = `${chalk.yellow(
Array.from({ length: review.rating })
.map(() => '★')
.join(''),
)}${Array.from({ length: 5 - review.rating })
.map(() => '☆')
.join('')}`
logger.info(
`[${chalk.bold(review.platform)}] ${stars} ${chalk.green(review.author_name)}: ${chalk.magenta(`${review.text.substring(0, 50)}...`)}`,
)
}
async function run({ playStorePackageName, appStoreAppId }: { playStorePackageName: string; appStoreAppId: string }) {
const reviews: Review[] = []
try {
// PLAY STORE
// prettier-ignore
const playStoreLocaleCodes = [ "af", "ak", "sq", "am", "ar", "hy", "az", "eu", "be", "bem", "bn", "bh", "xx-bork", "bs", "br", "bg", "km", "ca", "chr", "ny", "zh-cn", "zh-tw", "co", "hr", "cs", "da", "nl", "xx-elmer", "en", "eo", "et", "ee", "fo", "tl", "fi", "fr", "fy", "gaa", "gl", "ka", "de", "el", "kl", "gn", "gu", "xx-hacker", "ht", "ha", "haw", "iw", "hi", "hu", "is", "ig", "id", "ia", "ga", "it", "ja", "jw", "kn", "kk", "rw", "rn", "xx-klingon", "kg", "ko", "kri", "ku", "ckb", "ky", "lo", "la", "lv", "ln", "lt", "loz", "lg", "ach", "mk", "mg", "my", "ms", "ml", "mt", "mv", "mi", "mr", "mfe", "mo", "mn", "sr-me", "ne", "pcm", "nso", "no", "nn", "oc", "or", "om", "ps", "fa", "xx-pirate", "pl", "pt", "pt-br", "pt-pt", "pa", "qu", "ro", "rm", "nyn", "ru", "gd", "sr", "sh", "st", "tn", "crs", "sn","sd", "si", "sk", "sl", "so", "es", "es-419", "su", "sw", "sv", "tg", "ta", "tt", "te", "th", "ti", "to", "lua", "tum", "tr", "tk", "tw", "ug", "uk", "ur", "uz", "vu", "vi", "cy", "wo", "xh", "yi", "yo", "zu"] as const
for (const locale of playStoreLocaleCodes) {
logger.info(chalk.cyan(`Fetching PlayStore reviews for locale ${chalk.cyan.bold(locale)}...`))
reviews.push(
...(await scrapePlayStoreReviews({ packageName: playStorePackageName, locale })).filter((review) => {
if (reviews.some((r) => r.store_id === review.store_id)) return false
logReview(review)
return true
}),
)
}
// APP STORE
// prettier-ignore
const appStoreLocaleCodes = ["ae", "ag", "ai", "al", "am", "ao", "ar", "at", "au", "az", "bb", "be", "bf", "bg", "bh", "bj", "bm", "bn", "bo", "br", "bs", "bt", "bw", "by", "bz", "ca", "cg", "ch", "cl", "cn", "co", "cr", "cv", "cy", "cz", "de", "dk", "dm", "do", "dz", "ec", "ee", "eg", "es", "fi", "fj", "fm", "fr", "gb", "gd", "gh", "gm", "gr", "gt", "gw", "gy", "hk", "hn", "hr", "hu", "id", "ie", "il", "in", "is", "it", "jm", "jo", "jp", "ke", "kg", "kh", "kn", "kr", "kw", "ky", "kz", "la", "lb", "lc", "lk", "lr", "lt", "lu", "lv", "md", "mg", "mk", "ml", "mn", "mo", "mr", "ms", "mt", "mu", "mw", "mx", "my", "mz", "na", "ne", "ng", "ni", "nl", "no", "np", "nz", "om", "pa", "pe", "pg", "ph", "pk", "pl", "pt", "pw", "py", "qa", "ro", "ru", "sa", "sb", "sc", "se", "sg", "si", "sk", "sl", "sn", "sr", "st", "sv", "sz", "tc", "td", "th", "tj", "tm", "tn", "tr", "tt", "tw", "tz", "ua", "ug", "us", "uy", "uz", "vc", "ve", "vg", "vn", "ye", "za", "zw"] as const
for (const locale of appStoreLocaleCodes) {
logger.info(chalk.cyan(`Fetching AppStore reviews for locale ${chalk.cyan.bold(locale)}...`))
reviews.push(
...(await scrapeAppStoreReviews({ appId: appStoreAppId, locale })).filter((review) => {
if (reviews.some((r) => r.store_id === review.store_id)) return false
logReview(review)
return true
}),
)
}
logger.info(chalk.green(`Inserting reviews into database...`))
for (const review of reviews) {
logReview(review)
}
const { error } = await supabaseClient.from('reviews').upsert(Object.values(reviews), { onConflict: 'store_id' })
if (error) {
throw error
}
logger.info('')
logger.info(chalk.green.bold('Scrapping completed!'))
logger.info(`Android reviews: ${chalk.green.bold(reviews.filter((r) => r.platform === 'android').length)}`)
logger.info(`iOS reviews: ${chalk.green.bold(reviews.filter((r) => r.platform === 'ios').length)}`)
logger.info(`Total reviews: ${chalk.magenta.bold(reviews.length)}`)
} catch (err) {
logger.error(err, chalk.red.bold('Scrapping failed'))
process.exit(1)
}
}
run({
playStorePackageName: 'com.pixelheart.wledplus',
appStoreAppId: '6474789652',
})
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment