baditaflorin · September 23, 2017 01:05
diff --git a/filter_and_extract_posts_that_have_a_specific_tag_in_medium_com.sql b/filter_and_extract_posts_that_have_a_specific_tag_in_medium_com.sql
 ---------------------------SELECT --------------------------------------------
 select mps.user_username, -- 1st column
 mps.article_url,          -- 2nd column
 mps.image_count,          -- 3rd column
 mps.post_tags,            -- 4th column
 mps.recommends,           -- 5th column 
 mps.reading_time,         -- 6th column
 mps.title,                -- 7th column
 mpl.link_count,           -- 8th column    - we get this data from the left join, were we do a subquery
 '' full_text,             -- dummy 9th column
 -- comment the row above and uncomment the line below to also get the text
 -- mptxt.text full_text,   -- 9th column

 -- v0.6 added month and year information for each article. Useful to know if the tag becomes more/less used in time.  
 date_part('month',mps.created_at), -- 10th column
 date_part('year',mps.created_at),  -- 11th column
 mps.responses_created_count,       -- 12th column 
 mps.word_count,                    -- 13th column
 mps.email_snippet,                 -- 14th column
 mps.collection_name,               -- 15th column
 mps.detected_language              -- 16th column

 ---------------------------FROM -----------------------------------------------
 -- We load the medium_posts_stats as a subquery so that we can add a limit when we want to test features, instead of waiting for the whole database to load
 from (
 select * from medium_posts_stats mps --limit 50000
 ) mps

 -- Use a Left join, untl now i was doing inner join, and this is why some of he articles were missing. Sorry for all the persons that were not included because of this mistake.
 -- I will not redo the posts that I did until now, because it would take to much to do this. From now on, I will use this method to get the posts)
 left join 
 -- We do a count to get the number of links from the medium_posts_links, because medium_posts_stats does not have this information by default.
 (select count(*) link_count,post_id 
 from medium_posts_links mpl group by mpl.post_id) mpl 
 on (mpl.post_id = mps.post_id)

 -- uncomment the line below to get also the text
 -- left join medium_posts_text mptxt on mptxt.post_id = mpl.post_id

 ------------------------------------ FILTER -----------------------------------
 -- We ignore the posts that don`t have any tags.
 where length(post_tags) > 2 
 -- If we want to filter and get the results from a specific post.
 and post_tags ilike '%Venture Capital%'
 -- TODO make the recommends be decimal by default
 order by CAST(recommends as decimal) desc
 -- limit 300
	---------------------------SELECT --------------------------------------------
	select mps.user_username, -- 1st column
	mps.article_url, -- 2nd column
	mps.image_count, -- 3rd column
	mps.post_tags, -- 4th column
	mps.recommends, -- 5th column
	mps.reading_time, -- 6th column
	mps.title, -- 7th column
	mpl.link_count, -- 8th column - we get this data from the left join, were we do a subquery
	'' full_text, -- dummy 9th column
	-- comment the row above and uncomment the line below to also get the text
	-- mptxt.text full_text, -- 9th column

	-- v0.6 added month and year information for each article. Useful to know if the tag becomes more/less used in time.
	date_part('month',mps.created_at), -- 10th column
	date_part('year',mps.created_at), -- 11th column
	mps.responses_created_count, -- 12th column
	mps.word_count, -- 13th column
	mps.email_snippet, -- 14th column
	mps.collection_name, -- 15th column
	mps.detected_language -- 16th column

	---------------------------FROM -----------------------------------------------
	-- We load the medium_posts_stats as a subquery so that we can add a limit when we want to test features, instead of waiting for the whole database to load
	from (
	select * from medium_posts_stats mps --limit 50000
	) mps

	-- Use a Left join, untl now i was doing inner join, and this is why some of he articles were missing. Sorry for all the persons that were not included because of this mistake.
	-- I will not redo the posts that I did until now, because it would take to much to do this. From now on, I will use this method to get the posts)
	left join
	-- We do a count to get the number of links from the medium_posts_links, because medium_posts_stats does not have this information by default.
	(select count(*) link_count,post_id
	from medium_posts_links mpl group by mpl.post_id) mpl
	on (mpl.post_id = mps.post_id)

	-- uncomment the line below to get also the text
	-- left join medium_posts_text mptxt on mptxt.post_id = mpl.post_id

	------------------------------------ FILTER -----------------------------------
	-- We ignore the posts that don`t have any tags.
	where length(post_tags) > 2
	-- If we want to filter and get the results from a specific post.
	and post_tags ilike '%Venture Capital%'
	-- TODO make the recommends be decimal by default
	order by CAST(recommends as decimal) desc
	-- limit 300