Gist by @do-me, created May 25, 2019
# Tweets to sentiment of tokenized row items (average)
# ATTENTION: escape sequences such as \x in the raw text still cause bugs
# (\n is already replaced below)

import pandas as pd
import re
import os
os.chdir("C:/Users/Dome/Desktop/nu/Tweets/")

party = "fdp"
df = pd.read_json(party + ".json")

from nltk.tokenize import TweetTokenizer
from stop_words import get_stop_words
sw = get_stop_words('de')
tt = TweetTokenizer()

df['tokens'] = df['text'].apply(tt.tokenize)

# we want to keep # and @ words but remove punctuation marks and stopwords
df['clean'] = df['tokens'].apply(lambda x: [item for item in x if
                                            (item.isalpha() or re.match(r"[@#]", item))
                                            and item.lower() not in sw])
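# Illustrative example (assuming the usual German stop-word list): for the
# made-up tweet "Die #FDP begrüßt @user zur Debatte!" the tokenizer yields
# ['Die', '#FDP', 'begrüßt', '@user', 'zur', 'Debatte', '!'] and the filter
# keeps ['#FDP', 'begrüßt', '@user', 'Debatte'].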

###### clean df! ######
search_values = ["migr", "flücht", "auslä", "asyl", "flucht", "immigr",
                 "refugee", "geflüchte", "ausland", "zuwander", "zugewandert"]
mig = df[df.text.str.contains('|'.join(search_values))]  # regex search
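# Note: str.contains is case-sensitive by default, so lowercase patterns like
# "flücht" will not match capitalised German nouns such as "Flüchtlinge".
# A hedged variant of the same idea (case-insensitive, NaN-safe):
# mig = df[df.text.str.contains('|'.join(search_values), case=False, na=False)]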

import string
mig['plaintext'] = mig['text'].str.translate(str.maketrans('', '', string.punctuation))
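# str.maketrans('', '', string.punctuation) builds a translation table that
# deletes every ASCII punctuation character, e.g. (illustrative):
# "Nein, danke!".translate(str.maketrans('', '', string.punctuation)) -> 'Nein danke'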

# SPD example: problem with a backslash, only resolvable manually
# mig.plaintext[634] =  # clean the string by hand

mig = mig.replace({r'\n': ''}, regex=True)

# mig['plaintext'] = mig['plaintext'].str.replace(r"\\", " ", regex=True)
# DOESN'T work: a bare backslash in a regex pattern raises an error
# ("nothing to repeat" / bad escape), see
# https://stackoverflow.com/questions/3675144/regex-error-nothing-to-repeat
# Because of this, the \ character can't be replaced via this regex call.
# Workaround: remove \ manually with another expression, or as above.
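# Sketch of a workaround (assumes pandas >= 0.23, where str.replace accepts
# regex=False): a literal, non-regex replacement sidesteps the escaping issue:
# mig['plaintext'] = mig['plaintext'].str.replace("\\", " ", regex=False)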

###########################################################
# self-defined smallazure function

# without double quotation marks!!!!
def smallazure(x):  # x is a list of strings!

    # create the documents list as one big string:
    listi = []
    counter = 1
    for i in x:
        if len(i) > 5000:
            print("One of your list items had more than 5000 characters. Can't process.")
            break
        listi.append("{'id': '" + str(counter) + "', 'language': 'de', 'text': '" + i + "'}")
        counter += 1
    # join the items into a single list-literal string
    final = "[" + ", ".join(listi) + "]"

    import ast  # parses the string into real dicts, removing the stray quotation marks

    docs = {'documents': ast.literal_eval(final)}

    from pprint import pprint  # pretty output print in the console

    import requests
    subscription_key = 'XXXXXXX'  # api key

    # Microsoft Azure Text Analytics
    assert subscription_key
    text_analytics_base_url = "https://westcentralus.api.cognitive.microsoft.com/text/analytics/v2.1/"
    sentiment_api_url = text_analytics_base_url + "sentiment"

    headers = {"Ocp-Apim-Subscription-Key": subscription_key}
    response = requests.post(sentiment_api_url, headers=headers, json=docs)
    sentiments = response.json()
    pprint(sentiments)
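    # Typical (abridged) shape of a v2.1 sentiment response, for orientation:
    # {'documents': [{'id': '1', 'score': 0.87}, ...], 'errors': []}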
    retlist = []
    for doc in sentiments["documents"]:  # one sentiment score per returned document
        retlist.append(doc["score"])
    return retlist

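# A sturdier payload builder (a sketch, not the approach used above): building
# the dicts directly and letting requests serialise them to JSON avoids the
# quoting and backslash workarounds entirely. build_docs is a hypothetical helper.
def build_docs(texts):
    return {'documents': [{'id': str(n), 'language': 'de', 'text': t}
                          for n, t in enumerate(texts, start=1) if len(t) <= 5000]}
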
from statistics import mean  # calculate the mean
mean(smallazure(list(mig.plaintext)))
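# The v2.1 sentiment endpoint returns one score in [0, 1] per document (closer
# to 1 = more positive), so the value above is the average sentiment across all
# migration-related tweets.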