# Tweets to sentiment of tokenized row items (average)
# ATTENTION: bugs with \x or other similar escape characters (\n is replaced already)

import os
import re
import string
from pprint import pprint      # pretty-print the API response in the console
from statistics import mean    # average the sentiment scores at the end

import pandas as pd
import requests
from nltk.tokenize import TweetTokenizer
from stop_words import get_stop_words

os.chdir("C:/Users/Dome/Desktop/nu/Tweets/")

party = "fdp"
df = pd.read_json(party + ".json")

sw = get_stop_words('de')   # German stop-word list
tt = TweetTokenizer()

df['tokens'] = df['text'].apply(tt.tokenize)

# Keep #hashtag and @mention tokens but drop punctuation marks and stop words.
# The pattern is compiled once, and the grouping makes the precedence explicit.
hashtag_or_mention = re.compile(r"[@#]")
df['clean'] = df['tokens'].apply(
    lambda tokens: [t for t in tokens
                    if (t.isalpha() or hashtag_or_mention.match(t))
                    and t.lower() not in sw])
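
# A quick sanity check on a made-up example tweet (hypothetical data, only to
# illustrate the filter): hashtags and @mentions survive, punctuation and
# German stop words do not.
example = tt.tokenize("Die #FDP und @user twittern heute!")
print([t for t in example if (t.isalpha() or hashtag_or_mention.match(t))
       and t.lower() not in sw])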

###### filter df to migration-related tweets ######
search_values = ["migr", "flücht", "auslä", "asyl", "flucht", "immigr",
                 "refugee", "geflüchte", "ausland", "zuwander", "zugewandert"]
# regex OR-search; case=False so capitalised German nouns such as "Migration"
# or "Flüchtling" match too, and .copy() avoids pandas' SettingWithCopyWarning
# when new columns are added below
mig = df[df.text.str.contains('|'.join(search_values), case=False)].copy()

# strip all punctuation from the tweet text
mig['plaintext'] = mig['text'].str.translate(str.maketrans('', '', string.punctuation))

# SPD example: a tweet with a problematic backslash, only resolvable manually
# mig.plaintext[634] =  # clean the string by hand

mig = mig.replace({r'\n': ''}, regex=True)  # strip newline characters everywhere

# mig['plaintext'] = mig['plaintext'].str.replace(r"\\", " ", regex=True)
# did not work here: an improperly escaped backslash makes the regex engine
# raise errors such as "nothing to repeat"
# (see https://stackoverflow.com/questions/3675144/regex-error-nothing-to-repeat),
# so the \ character could not be replaced via regex.
# Workaround: remove \ manually with another expression, or as above.
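
# A minimal non-regex sketch instead (assuming the goal is simply to drop
# literal backslashes): with regex=False the pattern is treated as a plain
# substring, so no regex escaping is needed at all.
mig['plaintext'] = mig['plaintext'].str.replace("\\", " ", regex=False)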

###########################################################
# self-defined smallazure function

def smallazure(x):  # x is a list of tweet strings

    # Build the documents payload as a list of dicts; requests serialises it
    # to JSON, so quotation marks and backslashes inside the tweets are
    # escaped automatically (no string concatenation or ast tricks needed).
    documents = []
    for counter, text in enumerate(x, start=1):
        if len(text) > 5000:
            print("One of your list items had more than 5000 characters. Can't process.")
            break
        documents.append({'id': str(counter), 'language': 'de', 'text': text})

    docs = {'documents': documents}

    subscription_key = 'XXXXXXX'  # API key
    assert subscription_key

    # Microsoft Azure Text Analytics v2.1 sentiment endpoint
    text_analytics_base_url = "https://westcentralus.api.cognitive.microsoft.com/text/analytics/v2.1/"
    sentiment_api_url = text_analytics_base_url + "sentiment"

    headers = {"Ocp-Apim-Subscription-Key": subscription_key}
    response = requests.post(sentiment_api_url, headers=headers, json=docs)
    sentiments = response.json()
    pprint(sentiments)  # inspect the raw API response in the console

    # one sentiment score per document actually returned by the API
    return [doc["score"] for doc in sentiments["documents"]]
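
# For reference, the payload smallazure sends has this shape (hypothetical
# two-tweet example):
# {'documents': [{'id': '1', 'language': 'de', 'text': 'erster Tweet'},
#                {'id': '2', 'language': 'de', 'text': 'zweiter Tweet'}]}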

# average sentiment score over all migration-related tweets
mean(smallazure(list(mig.plaintext)))
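
# If mig holds more tweets than one request allows (the v2.1 API documents a
# limit of 1,000 documents per batch), a chunked variant is a reasonable
# sketch:
# texts = list(mig.plaintext)
# scores = []
# for start in range(0, len(texts), 1000):
#     scores += smallazure(texts[start:start + 1000])
# mean(scores)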