Last active
July 21, 2017 17:43
-
-
Save GER-NaN/03b36877cae38572d15d106d56e9aea9 to your computer and use it in GitHub Desktop.
Scrape articles and comments
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| using System.Collections.Generic; | |
| namespace wsbcollector.Article | |
| { | |
/// <summary>
/// Model for the <c>media_embed</c> member of <see cref="Data2"/>.
/// </summary>
/// <remarks>
/// Everything except <see cref="content"/> is declared nullable, so a member that is never
/// assigned stays <c>null</c> instead of falling back to zero or <c>false</c>.
/// Member names are snake_case and must not be renamed: no mapping attributes are declared,
/// so the names themselves are what a serializer binds against.
/// </remarks>
public class MediaEmbed
{
    public string content { get; set; }

    public int? width { get; set; }

    public bool? scrolling { get; set; }

    public int? height { get; set; }
}
/// <summary>
/// Model for the <c>oembed</c> member of <see cref="SecureMedia"/>.
/// </summary>
/// <remarks>
/// The four size members are plain <c>int</c>s, so they read as <c>0</c> when never assigned;
/// the string members read as <c>null</c>.
/// Member names are snake_case and must not be renamed: no mapping attributes are declared,
/// so the names themselves are what a serializer binds against.
/// </remarks>
public class Oembed
{
    public string provider_url { get; set; }

    public string description { get; set; }

    public string title { get; set; }

    public int thumbnail_width { get; set; }

    public int height { get; set; }

    public int width { get; set; }

    public string html { get; set; }

    public string version { get; set; }

    public string provider_name { get; set; }

    public string thumbnail_url { get; set; }

    public string type { get; set; }

    public int thumbnail_height { get; set; }
}
/// <summary>
/// Model for the <c>secure_media</c> member of <see cref="Data2"/>: a type string plus the
/// accompanying <see cref="Oembed"/> details.
/// </summary>
public class SecureMedia
{
    public string type { get; set; }

    public Oembed oembed { get; set; }
}
/// <summary>
/// Model for the <c>secure_media_embed</c> member of <see cref="Data2"/>.
/// </summary>
/// <remarks>
/// Declares the same four members, with the same types, as <see cref="MediaEmbed"/>.
/// All members except <see cref="content"/> are nullable and stay <c>null</c> until assigned.
/// </remarks>
public class SecureMediaEmbed
{
    public string content { get; set; }

    public int? width { get; set; }

    public bool? scrolling { get; set; }

    public int? height { get; set; }
}
/// <summary>
/// Model for the <c>oembed</c> member of <see cref="Media"/>.
/// </summary>
/// <remarks>
/// Member-for-member the same declaration as <see cref="Oembed"/>, kept as its own type.
/// The size members are plain <c>int</c>s (default <c>0</c>); the string members default to
/// <c>null</c>.
/// </remarks>
public class Oembed2
{
    public string provider_url { get; set; }

    public string description { get; set; }

    public string title { get; set; }

    public int thumbnail_width { get; set; }

    public int height { get; set; }

    public int width { get; set; }

    public string html { get; set; }

    public string version { get; set; }

    public string provider_name { get; set; }

    public string thumbnail_url { get; set; }

    public string type { get; set; }

    public int thumbnail_height { get; set; }
}
/// <summary>
/// Model for the <c>media</c> member of <see cref="Data2"/>: a type string plus the
/// accompanying <see cref="Oembed2"/> details.
/// </summary>
public class Media
{
    public string type { get; set; }

    public Oembed2 oembed { get; set; }
}
/// <summary>
/// Payload held in the <c>data</c> member of an article <see cref="Child"/>.
/// </summary>
/// <remarks>
/// Members typed <c>object</c> (or <c>List&lt;object&gt;</c>) have no fixed shape in this model;
/// whatever the serializer produces for them is stored as-is.
/// Member names are snake_case and must not be renamed or reordered casually: no mapping
/// attributes are declared, so the names themselves are what a serializer binds against.
/// </remarks>
public class Data2
{
    public bool contest_mode { get; set; }

    public object approved_at_utc { get; set; }

    public object banned_by { get; set; }

    public MediaEmbed media_embed { get; set; }

    public string subreddit { get; set; }

    public string selftext_html { get; set; }

    public string selftext { get; set; }

    public object likes { get; set; }

    public object suggested_sort { get; set; }

    public List<object> user_reports { get; set; }

    public SecureMedia secure_media { get; set; }

    public string link_flair_text { get; set; }

    // Substituted into the comment-page URL by Collector.Run.
    public string id { get; set; }

    public object banned_at_utc { get; set; }

    public object view_count { get; set; }

    public SecureMediaEmbed secure_media_embed { get; set; }

    public bool clicked { get; set; }

    public object report_reasons { get; set; }

    public string author { get; set; }

    public bool saved { get; set; }

    public List<object> mod_reports { get; set; }

    public bool can_mod_post { get; set; }

    public string name { get; set; }

    public int score { get; set; }

    public object approved_by { get; set; }

    public bool over_18 { get; set; }

    public string domain { get; set; }

    public bool hidden { get; set; }

    public string thumbnail { get; set; }

    public string subreddit_id { get; set; }

    public object edited { get; set; }

    public string link_flair_css_class { get; set; }

    public object author_flair_css_class { get; set; }

    public int gilded { get; set; }

    public int downs { get; set; }

    public bool brand_safe { get; set; }

    public bool archived { get; set; }

    public object removal_reason { get; set; }

    public bool can_gild { get; set; }

    public bool is_self { get; set; }

    public bool hide_score { get; set; }

    public bool spoiler { get; set; }

    public string permalink { get; set; }

    public object num_reports { get; set; }

    public bool locked { get; set; }

    public bool stickied { get; set; }

    public double created { get; set; }

    public string url { get; set; }

    public string author_flair_text { get; set; }

    public bool quarantine { get; set; }

    public string title { get; set; }

    public double created_utc { get; set; }

    public string subreddit_name_prefixed { get; set; }

    public object distinguished { get; set; }

    public Media media { get; set; }

    public int num_comments { get; set; }

    public bool visited { get; set; }

    public string subreddit_type { get; set; }

    public bool is_video { get; set; }

    public int ups { get; set; }

    // Nullable: stays null unless explicitly assigned.
    public bool? author_cakeday { get; set; }
}
/// <summary>
/// One entry of <see cref="Data.children"/>: a kind tag together with its
/// <see cref="Data2"/> payload.
/// </summary>
public class Child
{
    public string kind { get; set; }

    public Data2 data { get; set; }
}
/// <summary>
/// Model for the <c>data</c> member of <see cref="Articles"/>; holds the list of
/// <see cref="Child"/> entries plus the <c>modhash</c>, <c>after</c> and <c>before</c> values.
/// </summary>
/// <remarks>
/// <see cref="after"/> is a string here while <see cref="before"/> is left as an untyped
/// <c>object</c>.
/// </remarks>
public class Data
{
    public string modhash { get; set; }

    public List<Child> children { get; set; }

    public string after { get; set; }

    public object before { get; set; }
}
/// <summary>
/// Root of the article listing model: a kind tag and the <see cref="Data"/> it wraps.
/// This is the type <c>Collector.Run</c> deserializes the subreddit listing response into.
/// </summary>
public class Articles
{
    public string kind { get; set; }

    public Data data { get; set; }
}
| } |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| using System.Collections.Generic; | |
| using System.Web.Helpers; | |
| namespace wsbcollector.Comment | |
| { | |
/// <summary>
/// Placeholder type for the <c>media_embed</c> member of <see cref="Data2"/>.
/// It declares no members of its own.
/// </summary>
public class MediaEmbed
{
}
/// <summary>
/// Placeholder type for the <c>secure_media_embed</c> member of <see cref="Data2"/>.
/// It declares no members of its own.
/// </summary>
public class SecureMediaEmbed
{
}
/// <summary>
/// Payload held in the <c>data</c> member of a comment-page <see cref="Child"/>.
/// </summary>
/// <remarks>
/// <c>Collector.Run</c> materializes this type from each child's <c>data</c> token and reads
/// only <see cref="body"/> from it.
/// Members typed <c>object</c> (or <c>List&lt;object&gt;</c>) have no fixed shape in this model.
/// Member names are snake_case and must not be renamed: no mapping attributes are declared,
/// so the names themselves are what a serializer binds against.
/// </remarks>
public class Data2
{
    public bool contest_mode { get; set; }

    public object approved_at_utc { get; set; }

    public object banned_by { get; set; }

    public MediaEmbed media_embed { get; set; }

    public string subreddit { get; set; }

    public string selftext_html { get; set; }

    public string selftext { get; set; }

    public object likes { get; set; }

    public object suggested_sort { get; set; }

    public List<object> user_reports { get; set; }

    public object secure_media { get; set; }

    public bool saved { get; set; }

    public string id { get; set; }

    public object banned_at_utc { get; set; }

    public object view_count { get; set; }

    public SecureMediaEmbed secure_media_embed { get; set; }

    public bool clicked { get; set; }

    public object report_reasons { get; set; }

    public string author { get; set; }

    public string link_flair_text { get; set; }

    public bool can_mod_post { get; set; }

    public int score { get; set; }

    public object approved_by { get; set; }

    public bool over_18 { get; set; }

    public string domain { get; set; }

    public bool hidden { get; set; }

    public int num_comments { get; set; }

    public string thumbnail { get; set; }

    public string subreddit_id { get; set; }

    public bool edited { get; set; }

    public string link_flair_css_class { get; set; }

    public object author_flair_css_class { get; set; }

    public int gilded { get; set; }

    public int downs { get; set; }

    public bool brand_safe { get; set; }

    public bool archived { get; set; }

    public object removal_reason { get; set; }

    public bool stickied { get; set; }

    public bool can_gild { get; set; }

    public bool is_self { get; set; }

    public bool hide_score { get; set; }

    public bool spoiler { get; set; }

    public string permalink { get; set; }

    public string subreddit_type { get; set; }

    public bool locked { get; set; }

    public string name { get; set; }

    public double created { get; set; }

    public string url { get; set; }

    public string author_flair_text { get; set; }

    public bool quarantine { get; set; }

    public string title { get; set; }

    public double created_utc { get; set; }

    public string subreddit_name_prefixed { get; set; }

    public int ups { get; set; }

    public object media { get; set; }

    public double upvote_ratio { get; set; }

    public List<object> mod_reports { get; set; }

    public bool visited { get; set; }

    public object num_reports { get; set; }

    public bool is_video { get; set; }

    public object distinguished { get; set; }

    public string link_id { get; set; }

    public object replies { get; set; }

    public string parent_id { get; set; }

    // The members from here down are declared nullable (or are reference types), so they
    // stay null when never assigned.
    public int? controversiality { get; set; }

    // The text Collector.Run scans for '$' symbols.
    public string body { get; set; }

    public object collapsed_reason { get; set; }

    public string body_html { get; set; }

    public bool? score_hidden { get; set; }

    public bool? collapsed { get; set; }

    public int? depth { get; set; }
}
/// <summary>
/// One entry of <see cref="Data.children"/>: a kind tag together with its
/// <see cref="Data2"/> payload.
/// </summary>
public class Child
{
    public string kind { get; set; }

    public Data2 data { get; set; }
}
/// <summary>
/// Model for the <c>data</c> member of <see cref="RootObject"/>; holds the list of
/// <see cref="Child"/> entries plus the <c>modhash</c>, <c>after</c> and <c>before</c> values.
/// </summary>
/// <remarks>
/// Both <see cref="after"/> and <see cref="before"/> are left as untyped <c>object</c>s here.
/// </remarks>
public class Data
{
    public string modhash { get; set; }

    public List<Child> children { get; set; }

    public object after { get; set; }

    public object before { get; set; }
}
/// <summary>
/// Root of the comment listing model: a kind tag and the <see cref="Data"/> it wraps.
/// </summary>
public class RootObject
{
    public string kind { get; set; }

    public Data data { get; set; }
}
| } |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| using System; | |
| using System.Collections.Generic; | |
| using System.Net; | |
| using System.Text.RegularExpressions; | |
| using System.Threading; | |
| using Newtonsoft.Json; | |
| using Newtonsoft.Json.Linq; | |
| using wsbcollector.Article; | |
| using wsbcollector.Comment; | |
| namespace wsbcollector | |
| { | |
| //https://www.reddit.com/dev/api | |
/// <summary>
/// Downloads the r/wallstreetbets listing, then the comment page of every article in it, and
/// tallies how often each <c>$TICKER</c>-style symbol is mentioned in the comment bodies.
/// </summary>
/// <remarks>
/// Only the children of each listing in a comment page are inspected; anything held under a
/// comment's <c>replies</c> member is not walked.
/// </remarks>
public class Collector
{
    /// <summary>User-Agent value sent with every request.</summary>
    private const string UserAgent = "wsbcollector/1.0";

    /// <summary>Pause taken before each comment-page request, in milliseconds.</summary>
    private const int RequestDelayMilliseconds = 2000;

    /// <summary>
    /// A dollar sign followed by one to five letters that end on a word boundary. This keeps
    /// trailing punctuation and line breaks out of the symbol and skips amounts such as "$500".
    /// </summary>
    private static readonly Regex SymbolPattern = new Regex(@"\$[A-Za-z]{1,5}\b", RegexOptions.Compiled);

    readonly string subUrl = "https://www.reddit.com/r/wallstreetbets/.json?raw_json=1&limit=1000";
    readonly string commentUrl = "https://www.reddit.com/r/wallstreetbets/comments/{ArticleId}/.json?raw_json=1&limit=1000";

    /// <summary>
    /// Runs one collection pass: fetches the listing, scans every article's comments, prints
    /// the mention counts and then waits for Enter so the console window stays open.
    /// </summary>
    /// <exception cref="WebException">A download failed; the pass is aborted.</exception>
    public void Run()
    {
        Dictionary<string, int> mentionCounts = new Dictionary<string, int>();
        using (WebClient wc = new WebClient())
        {
            Articles sub = JsonConvert.DeserializeObject<Articles>(Download(wc, subUrl));
            var articles = sub?.data?.children;
            if (articles != null)
            {
                foreach (var child in articles)
                {
                    var articleId = child?.data?.id;
                    if (string.IsNullOrEmpty(articleId))
                    {
                        // Without an id there is no comment URL to build.
                        continue;
                    }

                    // Space the requests out so the site is not hit in a tight loop.
                    Thread.Sleep(RequestDelayMilliseconds);
                    var commentlink = commentUrl.Replace("{ArticleId}", articleId);
                    CountMentions(Download(wc, commentlink), mentionCounts);
                }
            }
        }

        PrintCounts(mentionCounts);
        Console.ReadLine();
    }

    /// <summary>Downloads <paramref name="url"/> as a string, identifying the client first.</summary>
    private static string Download(WebClient client, string url)
    {
        // Re-applied before every request: WebClient does not reliably keep restricted
        // headers such as User-Agent in its collection once a request has been issued.
        client.Headers[HttpRequestHeader.UserAgent] = UserAgent;
        return client.DownloadString(url);
    }

    /// <summary>
    /// Parses one comment page (a JSON array of listings) and adds every symbol found in the
    /// children's bodies to <paramref name="counts"/>.
    /// </summary>
    private static void CountMentions(string commentJson, Dictionary<string, int> counts)
    {
        JArray listings = JArray.Parse(commentJson);
        foreach (JObject listing in listings.Children<JObject>())
        {
            var children = (listing["data"] as JObject)?["children"] as JArray;
            if (children == null)
            {
                continue;
            }

            foreach (JToken result in children)
            {
                var payload = (result as JObject)?["data"] as JObject;
                if (payload == null)
                {
                    continue;
                }

                Comment.Data2 data = payload.ToObject<Comment.Data2>();
                List<string> symbols = ExtractSymbols(data.body);
                if (symbols.Count == 0)
                {
                    Console.WriteLine("Symbol not found");
                    continue;
                }

                foreach (string symbol in symbols)
                {
                    // TryGetValue leaves current at 0 for a symbol seen for the first time.
                    int current;
                    counts.TryGetValue(symbol, out current);
                    counts[symbol] = current + 1;
                }
            }
        }
    }

    /// <summary>
    /// Returns every symbol mentioned in <paramref name="body"/>, upper-cased so that
    /// "$aapl" and "$AAPL" share one counter. A null or empty body yields an empty list.
    /// </summary>
    private static List<string> ExtractSymbols(string body)
    {
        var symbols = new List<string>();
        if (string.IsNullOrEmpty(body))
        {
            return symbols;
        }

        foreach (Match match in SymbolPattern.Matches(body))
        {
            symbols.Add(match.Value.ToUpperInvariant());
        }

        return symbols;
    }

    /// <summary>Writes the counts to the console, most-mentioned symbol first.</summary>
    private static void PrintCounts(Dictionary<string, int> counts)
    {
        var ordered = new List<KeyValuePair<string, int>>(counts);
        ordered.Sort((left, right) =>
        {
            int byCount = right.Value.CompareTo(left.Value);
            return byCount != 0 ? byCount : string.CompareOrdinal(left.Key, right.Key);
        });

        foreach (var entry in ordered)
        {
            Console.WriteLine("{0}: {1}", entry.Key, entry.Value);
        }
    }
}
| } | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment