Created
December 25, 2019 09:07
-
-
Save NotAdam/bbbe78080af35ac86b1f632325391d40 to your computer and use it in GitHub Desktop.
parse yotpo review garbage
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/// <summary>
/// A single customer review, as populated by <c>ProcessPage</c> from the review widget markup.
/// </summary>
public class ReviewInfo
{
    /// <summary>Display name of the reviewer.</summary>
    public string Name { get; set; }

    /// <summary>Review date, stored as the raw text taken from the markup (not parsed).</summary>
    public string Date { get; set; }

    /// <summary>Star rating: the first integer found in the rating text, or 0.</summary>
    public int Stars { get; set; }

    /// <summary>Headline of the review.</summary>
    public string Title { get; set; }

    /// <summary>Body text of the review.</summary>
    public string Description { get; set; }

    /// <summary>URL of the picture attached to the review; stays null when there is none.</summary>
    public string ImageURL { get; set; }

    /// <summary>Count of "up" votes on the review.</summary>
    public int Upboats { get; set; }

    /// <summary>Count of "down" votes on the review.</summary>
    public int Downdoots { get; set; }

    /// <summary>
    /// Extra question/answer fields attached to the review, keyed by the question label.
    /// Starts out as an empty dictionary so callers can add entries straight away.
    /// </summary>
    public Dictionary<string, string> Questions { get; set; } = new Dictionary<string, string>();
}
// One HttpClient shared by every request made from this script. It is assigned once and
// never replaced, so it is marked readonly to keep it from being swapped out by accident.
private static readonly HttpClient client = new HttpClient();
/// <summary>
/// Extracts every review found in one page of review widget markup.
/// </summary>
/// <param name="penis">Parsed HTML of a single reviews page.</param>
/// <returns>
/// One <c>ReviewInfo</c> per matched element that has an author name; an empty list when the
/// page contains no matching elements.
/// </returns>
/// <exception cref="ArgumentNullException">The document is null.</exception>
public List<ReviewInfo> ProcessPage(HtmlDocument penis)
{
    if (penis == null)
    {
        throw new ArgumentNullException(nameof(penis));
    }

    var items = new List<ReviewInfo>();

    // SelectNodes hands back null (not an empty collection) when the XPath matches nothing.
    var reviews = penis.DocumentNode.SelectNodes("//div[contains(@class, 'yotpo-review')]");
    if (reviews == null)
    {
        return items;
    }

    foreach (var review in reviews)
    {
        // Trimmed inner text of the first node under this review matching the XPath, or null.
        Func<string, string> textOf = xpath => review.SelectSingleNode(xpath)?.InnerText.Trim();

        // The XPath above is a substring match on the class attribute, so it can select divs
        // that are not review cards; anything without an author name is skipped.
        var author = textOf(".//span[contains(@class, 'yotpo-user-name')]");
        if (string.IsNullOrEmpty(author))
        {
            continue;
        }

        var ri = new ReviewInfo
        {
            Name = author,
            Title = textOf(".//div[contains(@class, 'content-title')]"),
            Description = textOf(".//div[contains(@class, 'content-review')]"),
            Stars = ParseFirstInt(textOf(".//span[contains(@class, 'yotpo-review-stars')]/span[contains(@class, 'sr-only')]")),
            Upboats = ParseFirstInt(textOf(".//span[contains(@class, 'vote-sum')][@data-type='up']")),
            Downdoots = ParseFirstInt(textOf(".//span[contains(@class, 'vote-sum')][@data-type='down']")),
            Date = textOf(".//span[contains(@class, 'yotpo-review-date')]")
        };

        var img = review.SelectSingleNode(".//img[contains(@class, 'image-review media-review')]");
        var src = img?.Attributes["data-original-src"]?.Value;
        if (!string.IsNullOrEmpty(src))
        {
            // The scheme is prepended as before, except when the value already carries one,
            // which would otherwise yield "https:https://...".
            var alreadyAbsolute = src.StartsWith("http://", StringComparison.OrdinalIgnoreCase)
                || src.StartsWith("https://", StringComparison.OrdinalIgnoreCase);
            ri.ImageURL = alreadyAbsolute ? src : "https:" + src;
        }

        var questions = review.SelectNodes(".//div[@class=\"yotpo-question-field\"]");
        if (questions != null)
        {
            foreach (var question in questions)
            {
                var label = question.SelectSingleNode(".//div[@class=\"yotpo-question-field-description\"]")?.InnerText.Trim().Replace(":", "");
                var answer = question.SelectSingleNode(".//div[@class=\"yotpo-question-field-answer\"]")?.InnerText.Trim();

                // A field without a label cannot be keyed, so it is dropped.
                if (string.IsNullOrEmpty(label))
                {
                    continue;
                }

                // Indexer instead of Add: a repeated label overwrites rather than throwing.
                ri.Questions[label] = answer ?? string.Empty;
            }
        }

        items.Add(ri);
    }

    return items;
}

// Finds the first run of digits in a piece of text.
private static readonly Regex FirstIntPattern = new Regex(@"\d+");

// Returns the first integer embedded in the text, or 0 when the text is null/empty,
// contains no digits, or the digits do not fit in an int.
private static int ParseFirstInt(string text)
{
    if (string.IsNullOrEmpty(text))
    {
        return 0;
    }

    var match = FirstIntPattern.Match(text);
    if (!match.Success)
    {
        return 0;
    }

    int value;
    return int.TryParse(match.Value, System.Globalization.NumberStyles.None, System.Globalization.CultureInfo.InvariantCulture, out value)
        ? value
        : 0;
}
/// <summary>
/// Requests one page of reviews for a product from the Yotpo widget batch endpoint.
/// </summary>
/// <param name="productId">Product identifier sent as the "pid" parameter.</param>
/// <param name="page">Page number sent as the "page" parameter.</param>
/// <returns>The raw response body as a string.</returns>
/// <exception cref="HttpRequestException">The endpoint answered with a non-success status code.</exception>
public async Task<string> GetPage(string productId, int page)
{
    // The "methods" payload is built as real JSON instead of by placeholder and quote
    // substitution, so product ids containing quotes, backslashes or the former placeholder
    // words are encoded correctly. For plain ids the serialized text is unchanged.
    var parameters = new Newtonsoft.Json.Linq.JObject
    {
        ["pid"] = productId ?? string.Empty,
        ["order_metadata_fields"] = new Newtonsoft.Json.Linq.JObject(),
        ["index"] = 0,
        ["data_source"] = "default",
        ["page"] = page,
        ["host-widget"] = "main_widget",
        ["is_mobile"] = false,
        ["pictures_per_review"] = 10
    };
    var methods = new Newtonsoft.Json.Linq.JArray(
        new Newtonsoft.Json.Linq.JObject
        {
            ["method"] = "reviews",
            ["params"] = parameters
        });

    // NOTE(review): app_key and widget_version are hard-coded; presumably captured from the
    // target store's widget. Confirm they are still valid before relying on this.
    var form = new Dictionary<string, string>
    {
        { "methods", methods.ToString(Newtonsoft.Json.Formatting.None) },
        { "app_key", "12NaMZHdGcfHqPsBaDdcGMthwheuD4jzEUCrYzeV" },
        { "is_mobile", "false" },
        { "widget_version", "2019-12-23_14-03-36" }
    };

    using (var body = new FormUrlEncodedContent(form))
    using (var response = await client.PostAsync("https://staticw2.yotpo.com/batch", body))
    {
        // Fail here with the HTTP status rather than later while parsing an error page as JSON.
        response.EnsureSuccessStatusCode();
        return await response.Content.ReadAsStringAsync();
    }
}
/// <summary>
/// Downloads every page of reviews for a product and returns them as a single list.
/// </summary>
/// <param name="productId">Product identifier passed through to <c>GetPage</c>.</param>
/// <returns>Reviews from page 1 followed by those of the remaining pages, in page order.</returns>
/// <exception cref="InvalidOperationException">A response did not contain a "result" entry.</exception>
public async Task<List<ReviewInfo>> GetReviews(string productId)
{
    // Page 1 is fetched first because its pager tells us how many pages exist.
    var firstPage = LoadResultDocument(await GetPage(productId, 1));
    var pageCount = ReadPageCount(firstPage);

    var reviews = new List<ReviewInfo>();
    reviews.AddRange(ProcessPage(firstPage));

    // Start all remaining page requests, then wait for them together.
    var pending = new List<Task<string>>();
    for (var pageNumber = 2; pageNumber <= pageCount; pageNumber++)
    {
        pending.Add(GetPage(productId, pageNumber));
    }

    // WhenAll returns the results in the same order as the tasks, so page order is kept.
    var bodies = await Task.WhenAll(pending).ConfigureAwait(false);
    foreach (var body in bodies)
    {
        reviews.AddRange(ProcessPage(LoadResultDocument(body)));
    }

    return reviews;
}

// Parses a batch response and loads the "result" of its first entry as an HTML document.
private static HtmlDocument LoadResultDocument(string json)
{
    var result = Newtonsoft.Json.Linq.JArray.Parse(json).First?["result"];
    if (result == null)
    {
        throw new InvalidOperationException("Batch response did not contain a \"result\" entry.");
    }

    var document = new HtmlDocument();
    document.LoadHtml(result.ToString());
    return document;
}

// Reads the total page count from the pager links; returns 1 when there is no usable pager.
private static int ReadPageCount(HtmlDocument page)
{
    // SelectNodes returns null when nothing matches, e.g. when all reviews fit on one page.
    var pageLinks = page.DocumentNode.SelectNodes("//a[contains(@class, 'yotpo-page-element')]");
    if (pageLinks == null || pageLinks.Count < 2)
    {
        return 1;
    }

    // The second-to-last link is used because the last one was observed to have no text.
    var label = pageLinks[pageLinks.Count - 2].InnerText.Trim();

    int parsed;
    if (int.TryParse(label, System.Globalization.NumberStyles.Integer, System.Globalization.CultureInfo.InvariantCulture, out parsed) && parsed > 1)
    {
        return parsed;
    }

    return 1;
}
/// <summary>
/// Script entry point: fetches all reviews for one hard-coded product id and dumps them.
/// </summary>
/// <remarks>
/// Returns a Task instead of void so that a failure inside GetReviews reaches whoever awaits
/// or observes the entry point, rather than being raised as an unobservable async-void exception.
/// </remarks>
async Task Main()
{
    var reviews = await GetReviews("701-04410");

    // NOTE(review): Dump() is not defined in this file; presumably the LINQPad output extension.
    reviews.Dump();
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment