Skip to content

Instantly share code, notes, and snippets.

@GER-NaN
Last active July 21, 2017 17:43
Show Gist options
  • Select an option

  • Save GER-NaN/03b36877cae38572d15d106d56e9aea9 to your computer and use it in GitHub Desktop.

Select an option

Save GER-NaN/03b36877cae38572d15d106d56e9aea9 to your computer and use it in GitHub Desktop.
Scrape articles and comments
using System.Collections.Generic;
namespace wsbcollector.Article
{
/// <summary>
/// Data holder used as the type of <c>Data2.media_embed</c> in this namespace.
/// Property names are lower-case/snake_case, presumably mirroring the JSON keys
/// of the Reddit listing response (verify against the API).
/// </summary>
public class MediaEmbed
{
public string content { get; set; }
// width, scrolling and height are declared nullable, so they can hold null.
public int? width { get; set; }
public bool? scrolling { get; set; }
public int? height { get; set; }
}
/// <summary>
/// Data holder used as the type of <c>SecureMedia.oembed</c>.
/// Property names are snake_case so that Json.NET can bind them by name;
/// presumably they mirror the oEmbed object in Reddit's listing JSON.
/// </summary>
/// <remarks>
/// The numeric dimensions are nullable: a JSON <c>null</c> bound to a
/// non-nullable <c>int</c> makes Json.NET throw, which would abort
/// deserialization of the whole listing. This also matches the nullable
/// dimensions declared on <c>MediaEmbed</c>.
/// </remarks>
public class Oembed
{
    public string provider_url { get; set; }
    public string description { get; set; }
    public string title { get; set; }
    public int? thumbnail_width { get; set; }
    public int? height { get; set; }
    public int? width { get; set; }
    public string html { get; set; }
    public string version { get; set; }
    public string provider_name { get; set; }
    public string thumbnail_url { get; set; }
    public string type { get; set; }
    public int? thumbnail_height { get; set; }
}
/// <summary>
/// Data holder used as the type of <c>Data2.secure_media</c> in this namespace:
/// a <c>type</c> string plus a nested <c>Oembed</c> object.
/// </summary>
public class SecureMedia
{
public string type { get; set; }
public Oembed oembed { get; set; }
}
/// <summary>
/// Data holder used as the type of <c>Data2.secure_media_embed</c> in this namespace.
/// Declares the same four properties as <c>MediaEmbed</c>.
/// </summary>
public class SecureMediaEmbed
{
public string content { get; set; }
// width, scrolling and height are declared nullable, so they can hold null.
public int? width { get; set; }
public bool? scrolling { get; set; }
public int? height { get; set; }
}
/// <summary>
/// Data holder used as the type of <c>Media.oembed</c>.
/// Property names are snake_case so that Json.NET can bind them by name;
/// presumably they mirror the oEmbed object in Reddit's listing JSON.
/// </summary>
/// <remarks>
/// The numeric dimensions are nullable: a JSON <c>null</c> bound to a
/// non-nullable <c>int</c> makes Json.NET throw, which would abort
/// deserialization of the whole listing. This also matches the nullable
/// dimensions declared on <c>MediaEmbed</c>.
/// </remarks>
public class Oembed2
{
    public string provider_url { get; set; }
    public string description { get; set; }
    public string title { get; set; }
    public int? thumbnail_width { get; set; }
    public int? height { get; set; }
    public int? width { get; set; }
    public string html { get; set; }
    public string version { get; set; }
    public string provider_name { get; set; }
    public string thumbnail_url { get; set; }
    public string type { get; set; }
    public int? thumbnail_height { get; set; }
}
/// <summary>
/// Data holder used as the type of <c>Data2.media</c> in this namespace:
/// a <c>type</c> string plus a nested <c>Oembed2</c> object.
/// </summary>
public class Media
{
public string type { get; set; }
public Oembed2 oembed { get; set; }
}
/// <summary>
/// Payload of one listing entry (the type of <c>Child.data</c> in this namespace).
/// Collector.Run deserializes it as part of <c>Articles</c> and reads only
/// <c>id</c> from it, to build the per-article comment URL.
/// </summary>
/// <remarks>
/// Property names are snake_case, presumably mirroring the JSON keys of a
/// Reddit link object (verify against the API). Members typed <c>object</c>
/// do not commit this class to a particular JSON value type.
/// </remarks>
public class Data2
{
public bool contest_mode { get; set; }
public object approved_at_utc { get; set; }
public object banned_by { get; set; }
public MediaEmbed media_embed { get; set; }
public string subreddit { get; set; }
public string selftext_html { get; set; }
public string selftext { get; set; }
public object likes { get; set; }
public object suggested_sort { get; set; }
public List<object> user_reports { get; set; }
public SecureMedia secure_media { get; set; }
public string link_flair_text { get; set; }
// Substituted for "{ArticleId}" in Collector.commentUrl.
public string id { get; set; }
public object banned_at_utc { get; set; }
public object view_count { get; set; }
public SecureMediaEmbed secure_media_embed { get; set; }
public bool clicked { get; set; }
public object report_reasons { get; set; }
public string author { get; set; }
public bool saved { get; set; }
public List<object> mod_reports { get; set; }
public bool can_mod_post { get; set; }
public string name { get; set; }
public int score { get; set; }
public object approved_by { get; set; }
public bool over_18 { get; set; }
public string domain { get; set; }
public bool hidden { get; set; }
public string thumbnail { get; set; }
public string subreddit_id { get; set; }
public object edited { get; set; }
public string link_flair_css_class { get; set; }
public object author_flair_css_class { get; set; }
public int gilded { get; set; }
public int downs { get; set; }
public bool brand_safe { get; set; }
public bool archived { get; set; }
public object removal_reason { get; set; }
public bool can_gild { get; set; }
public bool is_self { get; set; }
public bool hide_score { get; set; }
public bool spoiler { get; set; }
public string permalink { get; set; }
public object num_reports { get; set; }
public bool locked { get; set; }
public bool stickied { get; set; }
public double created { get; set; }
public string url { get; set; }
public string author_flair_text { get; set; }
public bool quarantine { get; set; }
public string title { get; set; }
public double created_utc { get; set; }
public string subreddit_name_prefixed { get; set; }
public object distinguished { get; set; }
public Media media { get; set; }
public int num_comments { get; set; }
public bool visited { get; set; }
public string subreddit_type { get; set; }
public bool is_video { get; set; }
public int ups { get; set; }
// Nullable, so it can hold null.
public bool? author_cakeday { get; set; }
}
/// <summary>
/// One element of <c>Data.children</c> in this namespace: a <c>kind</c> string
/// plus the entry payload in <c>data</c>. Collector.Run iterates these and
/// reads <c>data.id</c>.
/// </summary>
public class Child
{
public string kind { get; set; }
public Data2 data { get; set; }
}
/// <summary>
/// Body of the listing (the type of <c>Articles.data</c>): the list of
/// <c>Child</c> entries plus <c>modhash</c>, <c>after</c> and <c>before</c>.
/// </summary>
public class Data
{
public string modhash { get; set; }
// Enumerated by Collector.Run, one comment request per entry.
public List<Child> children { get; set; }
// NOTE(review): after/before look like paging cursors; Collector.Run never reads them — confirm.
public string after { get; set; }
public object before { get; set; }
}
/// <summary>
/// Root object of the subreddit listing response. Collector.Run produces it
/// with <c>JsonConvert.DeserializeObject&lt;Articles&gt;</c> and walks
/// <c>data.children</c>.
/// </summary>
public class Articles
{
public string kind { get; set; }
public Data data { get; set; }
}
}
using System.Collections.Generic;
using System.Web.Helpers;
namespace wsbcollector.Comment
{
/// <summary>
/// Type of <c>Data2.media_embed</c> in this namespace. It declares no
/// members, so nothing from the corresponding JSON object is captured.
/// </summary>
public class MediaEmbed
{
}
/// <summary>
/// Type of <c>Data2.secure_media_embed</c> in this namespace. It declares no
/// members, so nothing from the corresponding JSON object is captured.
/// </summary>
public class SecureMediaEmbed
{
}
/// <summary>
/// Payload of one entry in a comment-page listing (the type of
/// <c>Child.data</c> in this namespace). Collector.Run creates instances with
/// <c>JToken.ToObject&lt;Comment.Data2&gt;()</c> and reads only <c>body</c>.
/// </summary>
/// <remarks>
/// The class mixes properties that also appear on <c>Article.Data2</c> with
/// comment-only ones (<c>body</c>, <c>parent_id</c>, <c>depth</c>, ...);
/// presumably because the comment response contains both the article and its
/// comments — verify against the API. Members typed <c>object</c> do not
/// commit this class to a particular JSON value type.
/// </remarks>
public class Data2
{
public bool contest_mode { get; set; }
public object approved_at_utc { get; set; }
public object banned_by { get; set; }
public MediaEmbed media_embed { get; set; }
public string subreddit { get; set; }
public string selftext_html { get; set; }
public string selftext { get; set; }
public object likes { get; set; }
public object suggested_sort { get; set; }
public List<object> user_reports { get; set; }
public object secure_media { get; set; }
public bool saved { get; set; }
public string id { get; set; }
public object banned_at_utc { get; set; }
public object view_count { get; set; }
public SecureMediaEmbed secure_media_embed { get; set; }
public bool clicked { get; set; }
public object report_reasons { get; set; }
public string author { get; set; }
public string link_flair_text { get; set; }
public bool can_mod_post { get; set; }
public int score { get; set; }
public object approved_by { get; set; }
public bool over_18 { get; set; }
public string domain { get; set; }
public bool hidden { get; set; }
public int num_comments { get; set; }
public string thumbnail { get; set; }
public string subreddit_id { get; set; }
// NOTE(review): Article.Data2 declares `edited` as object while this class uses bool — confirm which matches the API.
public bool edited { get; set; }
public string link_flair_css_class { get; set; }
public object author_flair_css_class { get; set; }
public int gilded { get; set; }
public int downs { get; set; }
public bool brand_safe { get; set; }
public bool archived { get; set; }
public object removal_reason { get; set; }
public bool stickied { get; set; }
public bool can_gild { get; set; }
public bool is_self { get; set; }
public bool hide_score { get; set; }
public bool spoiler { get; set; }
public string permalink { get; set; }
public string subreddit_type { get; set; }
public bool locked { get; set; }
public string name { get; set; }
public double created { get; set; }
public string url { get; set; }
public string author_flair_text { get; set; }
public bool quarantine { get; set; }
public string title { get; set; }
public double created_utc { get; set; }
public string subreddit_name_prefixed { get; set; }
public int ups { get; set; }
public object media { get; set; }
public double upvote_ratio { get; set; }
public List<object> mod_reports { get; set; }
public bool visited { get; set; }
public object num_reports { get; set; }
public bool is_video { get; set; }
public object distinguished { get; set; }
public string link_id { get; set; }
// Left untyped; Collector.Run does not descend into replies.
public object replies { get; set; }
public string parent_id { get; set; }
public int? controversiality { get; set; }
// The comment text that Collector.Run scans for a "$"-prefixed token; may be null.
public string body { get; set; }
public object collapsed_reason { get; set; }
public string body_html { get; set; }
public bool? score_hidden { get; set; }
public bool? collapsed { get; set; }
public int? depth { get; set; }
}
/// <summary>
/// One element of <c>Data.children</c> in this namespace: a <c>kind</c> string
/// plus the entry payload in <c>data</c>.
/// </summary>
public class Child
{
public string kind { get; set; }
public Data2 data { get; set; }
}
/// <summary>
/// Body of a comment listing (the type of <c>RootObject.data</c>): the list of
/// <c>Child</c> entries plus <c>modhash</c>, <c>after</c> and <c>before</c>.
/// </summary>
public class Data
{
public string modhash { get; set; }
public List<Child> children { get; set; }
// NOTE(review): Article.Data declares `after` as string; here it is object — confirm the intended type.
public object after { get; set; }
public object before { get; set; }
}
/// <summary>
/// Root object of one comment listing: a <c>kind</c> string plus its
/// <c>Data</c> body. Collector.Run does not use this type; it parses the
/// comment response with <c>JArray.Parse</c> and converts individual entries
/// to <c>Data2</c> instead.
/// </summary>
public class RootObject
{
public string kind { get; set; }
public Data data { get; set; }
}
}
using System;
using System.Collections.Generic;
using System.Net;
using System.Text.RegularExpressions;
using System.Threading;
using Newtonsoft.Json;
using Newtonsoft.Json.Linq;
using wsbcollector.Article;
using wsbcollector.Comment;
namespace wsbcollector
{
//https://www.reddit.com/dev/api
/// <summary>
/// Downloads the r/wallstreetbets listing, then the comment page of every
/// listed article, and counts how often each "$"-prefixed token appears as the
/// first such token of a comment body.
/// </summary>
public class Collector
{
    // Reddit's API rules ask clients to send a unique, descriptive User-Agent;
    // requests without one are liable to be throttled or rejected.
    private const string UserAgent = "windows:wsbcollector:v1.0 (ticker mention counter)";

    // Pause before each comment-page request so the API is not hammered.
    private const int RequestDelayMilliseconds = 2000;

    private readonly string subUrl = "https://www.reddit.com/r/wallstreetbets/.json?raw_json=1&limit=1000";
    private readonly string commentUrl = "https://www.reddit.com/r/wallstreetbets/comments/{ArticleId}/.json?raw_json=1&limit=1000";

    /// <summary>
    /// Runs the collection synchronously: fetches the listing, fetches each
    /// article's comments (one request every two seconds), tallies symbols,
    /// prints the tally to the console and then waits for Enter.
    /// </summary>
    /// <exception cref="WebException">A download failed.</exception>
    public void Run()
    {
        Dictionary<string, int> mentionCounts = new Dictionary<string, int>();
        using (WebClient wc = new WebClient())
        {
            Articles sub = JsonConvert.DeserializeObject<Articles>(DownloadJson(wc, subUrl));
            var articles = sub?.data?.children;
            if (articles != null)
            {
                foreach (var child in articles)
                {
                    var articleId = child?.data?.id;
                    if (string.IsNullOrEmpty(articleId))
                    {
                        // Without an id there is no comment page to request.
                        continue;
                    }

                    Thread.Sleep(RequestDelayMilliseconds);
                    var commentLink = commentUrl.Replace("{ArticleId}", articleId);
                    CountMentions(JArray.Parse(DownloadJson(wc, commentLink)), mentionCounts);
                }
            }
        }

        PrintMentionCounts(mentionCounts);
        Console.ReadLine();
    }

    /// <summary>
    /// Downloads <paramref name="url"/> as a string, identifying this client
    /// through the User-Agent header.
    /// </summary>
    private static string DownloadJson(WebClient client, string url)
    {
        // Set on every call in case WebClient does not retain the header between requests.
        client.Headers[HttpRequestHeader.UserAgent] = UserAgent;
        return client.DownloadString(url);
    }

    /// <summary>
    /// Walks every listing in a comment-page response and adds the symbol of
    /// each entry (if any) to <paramref name="mentionCounts"/>.
    /// </summary>
    private static void CountMentions(JArray listings, Dictionary<string, int> mentionCounts)
    {
        foreach (JObject listing in listings.Children<JObject>())
        {
            JToken entries = listing["data"]?["children"];
            if (entries == null)
            {
                continue;
            }

            foreach (JToken result in entries)
            {
                Comment.Data2 data = result["data"]?.ToObject<Comment.Data2>();
                string symbol = ExtractSymbol(data?.body);
                if (symbol == null)
                {
                    Console.WriteLine("Symbol not found");
                    continue;
                }

                // TryGetValue leaves count at 0 for a new symbol: one lookup instead of ContainsKey + indexer.
                int count;
                mentionCounts.TryGetValue(symbol, out count);
                mentionCounts[symbol] = count + 1;
            }
        }
    }

    /// <summary>
    /// Returns the first token of <paramref name="commentBody"/> that starts
    /// with '$', running up to the next whitespace character (or the end of
    /// the text), or null when the text is null or contains no '$'.
    /// </summary>
    private static string ExtractSymbol(string commentBody)
    {
        if (string.IsNullOrEmpty(commentBody))
        {
            return null;
        }

        int start = commentBody.IndexOf('$');
        if (start < 0)
        {
            return null;
        }

        // Stop at any whitespace, not only ' ', so a newline or tab after the
        // symbol does not glue following text onto it.
        int end = start;
        while (end < commentBody.Length && !char.IsWhiteSpace(commentBody[end]))
        {
            end++;
        }

        return commentBody.Substring(start, end - start);
    }

    /// <summary>
    /// Writes every symbol and its count to the console, most mentioned first.
    /// </summary>
    private static void PrintMentionCounts(Dictionary<string, int> mentionCounts)
    {
        var ranked = new List<KeyValuePair<string, int>>(mentionCounts);
        ranked.Sort((left, right) => right.Value.CompareTo(left.Value));
        foreach (var entry in ranked)
        {
            Console.WriteLine("{0}: {1}", entry.Key, entry.Value);
        }
    }
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment