# Tweets to sentiment of tokenized row items (average)
# ATTENTION: bugs with \x or other similar escape characters (\n is replaced already)

import os
import re
import string
from pprint import pprint      # pretty-print the API response in the console
from statistics import mean    # average the sentiment scores at the end

import pandas as pd
import requests
from nltk.tokenize import TweetTokenizer
from stop_words import get_stop_words

os.chdir("C:/Users/Dome/Desktop/nu/Tweets/")

party = "fdp"
df = pd.read_json(party + ".json")

sw = get_stop_words('de')   # German stop-word list
tt = TweetTokenizer()

df['tokens'] = df['text'].apply(tt.tokenize)

# Keep #hashtag and @mention tokens but drop punctuation marks and stop words.
# The pattern is compiled once, and the grouping makes the precedence explicit.
hashtag_or_mention = re.compile(r"[@#]")
df['clean'] = df['tokens'].apply(
    lambda tokens: [t for t in tokens
                    if (t.isalpha() or hashtag_or_mention.match(t))
                    and t.lower() not in sw])
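
# A quick sanity check on a made-up example tweet (hypothetical data, only to
# illustrate the filter): hashtags and @mentions survive, punctuation and
# German stop words do not.
example = tt.tokenize("Die #FDP und @user twittern heute!")
print([t for t in example if (t.isalpha() or hashtag_or_mention.match(t))
       and t.lower() not in sw])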

###### filter df to migration-related tweets ######
search_values = ["migr", "flücht", "auslä", "asyl", "flucht", "immigr",
                 "refugee", "geflüchte", "ausland", "zuwander", "zugewandert"]
# regex OR-search; case=False so capitalised German nouns such as "Migration"
# or "Flüchtling" match too, and .copy() avoids pandas' SettingWithCopyWarning
# when new columns are added below
mig = df[df.text.str.contains('|'.join(search_values), case=False)].copy()

# strip all punctuation from the tweet text
mig['plaintext'] = mig['text'].str.translate(str.maketrans('', '', string.punctuation))

# SPD example: a tweet with a problematic backslash, only resolvable manually
# mig.plaintext[634] =  # clean the string by hand

mig = mig.replace({r'\n': ''}, regex=True)  # strip newline characters everywhere

# mig['plaintext'] = mig['plaintext'].str.replace(r"\\", " ", regex=True)
# did not work here: an improperly escaped backslash makes the regex engine
# raise errors such as "nothing to repeat"
# (see https://stackoverflow.com/questions/3675144/regex-error-nothing-to-repeat),
# so the \ character could not be replaced via regex.
# Workaround: remove \ manually with another expression, or as above.
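
# A minimal non-regex sketch instead (assuming the goal is simply to drop
# literal backslashes): with regex=False the pattern is treated as a plain
# substring, so no regex escaping is needed at all.
mig['plaintext'] = mig['plaintext'].str.replace("\\", " ", regex=False)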

###########################################################
# self-defined smallazure function

def smallazure(x):  # x is a list of tweet strings

    # Build the documents payload as a list of dicts; requests serialises it
    # to JSON, so quotation marks and backslashes inside the tweets are
    # escaped automatically (no string concatenation or ast tricks needed).
    documents = []
    for counter, text in enumerate(x, start=1):
        if len(text) > 5000:
            print("One of your list items had more than 5000 characters. Can't process.")
            break
        documents.append({'id': str(counter), 'language': 'de', 'text': text})

    docs = {'documents': documents}

    subscription_key = 'XXXXXXX'  # API key
    assert subscription_key

    # Microsoft Azure Text Analytics v2.1 sentiment endpoint
    text_analytics_base_url = "https://westcentralus.api.cognitive.microsoft.com/text/analytics/v2.1/"
    sentiment_api_url = text_analytics_base_url + "sentiment"

    headers = {"Ocp-Apim-Subscription-Key": subscription_key}
    response = requests.post(sentiment_api_url, headers=headers, json=docs)
    sentiments = response.json()
    pprint(sentiments)  # inspect the raw API response in the console

    # one sentiment score per document actually returned by the API
    return [doc["score"] for doc in sentiments["documents"]]
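
# For reference, the payload smallazure sends has this shape (hypothetical
# two-tweet example):
# {'documents': [{'id': '1', 'language': 'de', 'text': 'erster Tweet'},
#                {'id': '2', 'language': 'de', 'text': 'zweiter Tweet'}]}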

# average sentiment score over all migration-related tweets
mean(smallazure(list(mig.plaintext)))
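
# If mig holds more tweets than one request allows (the v2.1 API documents a
# limit of 1,000 documents per batch), a chunked variant is a reasonable
# sketch:
# texts = list(mig.plaintext)
# scores = []
# for start in range(0, len(texts), 1000):
#     scores += smallazure(texts[start:start + 1000])
# mean(scores)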