Skip to content

Instantly share code, notes, and snippets.

@pwin
Created June 29, 2015 18:15
Show Gist options
  • Save pwin/c0e972e93169f7b3591a to your computer and use it in GitHub Desktop.
Save pwin/c0e972e93169f7b3591a to your computer and use it in GitHub Desktop.
import requests
import matplotlib.pyplot as plt
from wordcloud import WordCloud, STOPWORDS
import sys
# Every Share-PSI best-practice page lives under the same wiki root, so the
# full URLs are assembled from that root plus the individual page names.
_WIKI_ROOT = 'http://www.w3.org/2013/share-psi/wiki/Best_Practices/'

_PAGE_NAMES = (
    'Cross_Agency_Strategy',
    'High_Level_Support',
    'Holistic_Metrics',
    'User_engagement_and_collaboration_throughout_the_lifecycle',
    'Organisational-internal_engagement',
    'Human_Readability_and_Machine_Processing',
    'Cost_of_Publication',
    'Stakeholders%E2%80%99_Interests_and_Rights',
    'Feedback_to_Improve_Quality',
    'Optimization_for_Search_Engines',
    'Publication_with_Common_Metadata',
    'Catalogs_and_Indexes',
    'Encourage_crowdsourcing',
    'Publish_spatial_data_on_the_web',
    'Monitoring_and_Benchmarking',
    'Open_Data_quality_assessment',
    'Identifying_what_you_already_publish',
    'Make_the_data_available_in_the_language_people_want_it',
    'Management_Of_A_Wide_Public_Actors_Network',
    'Making_Research_Results_Open_For_The_Country',
    'Using_Business_Process_Paradigm_For_Open_Data_Lifecycle_Management',
    'Publishing_Statistical_Data_In_Linked_Data_Format',
    'Supervizor_-_An_Indispensable_Open_Government_Application_(Transparency_Of_Public_Spending)',
    'Civic_Use_Of_Open_Data',
    'Open_Data_Publication_Plan',
    'A_Federation_Tool_For_Opendata_Portals',
    'Traffic_Light_System_For_Data_Sharing',
    'Open_Data_To_Improve_Sharing_And_Publication_Of_Information_Between_Public_Administrations',
    'Commercial_Considerations_in_Open_Data_Portal_Design',
    'Infomediary_Sector_Characteristics',
    'Open_Data_2.0_-_Changing_Perspectives',
    'Open_Data_Business_Model_Patterns_and_Open_Data_Business_Value_Disciplines',
    'The_Central_Role_of_Location',
    'An_ongoing_open_dialog_in_an_open_data_ecosystem',
    'Discover_published_information_by_site_scraping',
    'Free_our_maps',
)

# Wiki pages to fetch, in processing order (one word cloud per page).
uris = [_WIKI_ROOT + page_name for page_name in _PAGE_NAMES]
# Section headings of the best-practice wiki template; in the raw wiki text
# each one appears wrapped in '=' signs.
_TEMPLATE_HEADINGS = (
    'Title',
    'Short Description',
    'Overview',
    'Why',
    'Intended Outcome',
    'Life Cycle Stage',
    'Possible Approach',
    'How to Test',
    'Evidence',
    'Lifecycle Stage',
    'Audience',
    'Related Best Practices',
    'Tags',
    'Status',
    'Intended Audience',
)

# Other markup and form phrases that appear on the pages.
_TEMPLATE_PHRASES = (
    'nowiki',
    'Name of the Share-PSI workshop:',
    'Title of the Best Practice:',
    'Outline of the best practice',
    'Management summary',
    'Challenge',
    'Solution.',
    'Best Practice Identification',
    'Why is this a Best Practice?',
    "What's the impact of the Best Practice?",
    'Link to the PSI Directive',
    'Why is there a need for this Best Practice?',
    'What do you need for this Best Practice?',
    'Applicability by other member states?',
    'Contact info - record of the person to be contacted for additional information or advice.',
)

# Substrings stripped from each page's text before the word cloud is built.
replacements = ['=' + heading + '=' for heading in _TEMPLATE_HEADINGS]
replacements.extend(_TEMPLATE_PHRASES)
# When True, echo the request URL and the page text before and after cleaning,
# and stop after the first page so a single cloud can be inspected.
debug = True

# A copy of wordcloud's default stop words plus "data". set.add() returns
# None, so the result of STOPWORDS.add(...) must not be passed as `stopwords`;
# building a new set also leaves the library's global set untouched.
cloud_stopwords = STOPWORDS | {"data"}

# Render one PNG word cloud per wiki page, named after the page.
for uri in uris:
    raw_url = uri + "?action=raw"
    fname = uri.split('/')[-1] + '.png'
    if debug:
        print(raw_url)

    # Fetch the raw wiki markup. A page that cannot be retrieved is skipped
    # instead of turning an error response into a word cloud.
    try:
        response = requests.get(raw_url, timeout=30)
        response.raise_for_status()
    except requests.RequestException as err:
        print("Skipping %s: %s" % (raw_url, err))
        continue
    text = response.text
    if debug:
        print(text)

    # Strip the template boilerplate BEFORE lower-casing: the markers in
    # `replacements` are mixed-case, so they never match lower-cased text.
    for marker in replacements:
        text = text.replace(marker, '')
    text = text.lower()
    if debug:
        print(text)

    wc = WordCloud(background_color="white", max_words=2000,
                   stopwords=cloud_stopwords)
    wc.generate(text)

    # The figure size is a property of the figure, not of savefig(), so it
    # is set when the figure is created.
    plt.figure(figsize=(7, 6.5))
    plt.imshow(wc)
    plt.axis("off")
    plt.savefig(fname, dpi=200)
    plt.close()

    if debug:
        sys.exit(0)

print("Finished")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment