Extract, transform, classify and load the news.
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## ETL News"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"# @hidden_cell\n", | |
"# The project token is an authorization token that is used to access project resources like data sources, connections, and used by platform APIs.\n", | |
"from project_lib import Project\n", | |
"project = Project(project_id='********-****-******-****-**********', project_access_token='p-************************************')\n", | |
"pc = project.project_context" | |
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"%%capture\n",
"!pip install google-api-python-client==1.8.3\n",
"!pip install scrapy==2.1.0\n",
"!pip install attrs==19.3.0\n",
"!pip install scikit-learn==0.22\n",
"!pip install h5py==2.10.0\n",
"!pip install tensorflow==2.2.0rc0\n",
"!pip install keras==2.3.1\n",
"!pip install nltk==3.4"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Using TensorFlow backend.\n"
]
}
],
"source": [
"import pandas as pd\n", | |
"\n", | |
"## Google Search API\n", | |
"from googleapiclient.discovery import build\n", | |
"\n", | |
"### Crawler Scrapy\n", | |
"import unicodedata\n", | |
"import scrapy\n", | |
"import requests\n", | |
"from scrapy.http import TextResponse\n", | |
"import json\n", | |
"import string\n", | |
"from nltk import word_tokenize\n", | |
"from nltk.corpus import stopwords\n", | |
"\n", | |
"### RNN Classifier\n", | |
"from keras.preprocessing.text import Tokenizer\n", | |
"from keras.preprocessing.sequence import pad_sequences\n", | |
"\n", | |
"from sklearn import preprocessing\n", | |
"\n", | |
"import nltk\n", | |
"import unicodedata\n", | |
"import string\n", | |
"import re\n", | |
"import seaborn as sns\n", | |
"import itertools\n", | |
"import pickle\n", | |
"\n", | |
"from nltk import word_tokenize\n", | |
"from nltk.corpus import stopwords\n", | |
"from keras.models import load_model\n", | |
"\n", | |
"DIM_GLOVE = 50\n", | |
"n_words = 20000" | |
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"## Google api\n", | |
"from googleapiclient.discovery import build\n", | |
"\n", | |
"\n", | |
"api_key = \"*********************************\"\n", | |
"\n", | |
"my_api_key = \"**********************************\"\n", | |
"my_cse_id = \"*******************************\" ## Custom Search element control API\n", | |
"\n", | |
"def google_search(search_term, api_key, cse_id, **kwargs):\n", | |
" service = build(\"customsearch\", \"v1\", developerKey=api_key)\n", | |
" res = service.cse().list(q=search_term, cx=cse_id, **kwargs).execute()\n", | |
" return res" | |
]
},
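{
"cell_type": "markdown",
"metadata": {},
"source": [
"The Custom Search JSON API caps each call at 10 results. A minimal sketch for collecting several pages, relying only on the documented `start` offset parameter; `google_search_pages` is an illustrative helper added here, not part of the original pipeline:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def google_search_pages(search_term, api_key, cse_id, pages=3, **kwargs):\n",
"    service = build(\"customsearch\", \"v1\", developerKey=api_key)\n",
"    collected = []\n",
"    for page in range(pages):\n",
"        ## 'start' is 1-based; each page holds at most 10 items\n",
"        res = service.cse().list(q=search_term, cx=cse_id, start=1 + page * 10, **kwargs).execute()\n",
"        collected.extend(res.get('items', []))  ## 'items' is absent when a page has no hits\n",
"    return collected"
]
},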
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"WARNING: Logging before flag parsing goes to stderr.\n",
"W0703 03:26:29.753084 139768582977344 __init__.py:46] file_cache is unavailable when using oauth2client >= 4.0.0 or google-auth\n",
"Traceback (most recent call last):\n",
" File \"/opt/conda/envs/Python36/lib/python3.6/site-packages/googleapiclient/discovery_cache/__init__.py\", line 36, in autodetect\n",
" from google.appengine.api import memcache\n",
"ModuleNotFoundError: No module named 'google.appengine'\n",
"\n",
"During handling of the above exception, another exception occurred:\n",
"\n",
"Traceback (most recent call last):\n",
" File \"/opt/conda/envs/Python36/lib/python3.6/site-packages/googleapiclient/discovery_cache/file_cache.py\", line 33, in <module>\n",
" from oauth2client.contrib.locked_file import LockedFile\n",
"ModuleNotFoundError: No module named 'oauth2client'\n",
"\n",
"During handling of the above exception, another exception occurred:\n",
"\n",
"Traceback (most recent call last):\n",
" File \"/opt/conda/envs/Python36/lib/python3.6/site-packages/googleapiclient/discovery_cache/file_cache.py\", line 37, in <module>\n",
" from oauth2client.locked_file import LockedFile\n",
"ModuleNotFoundError: No module named 'oauth2client'\n",
"\n",
"During handling of the above exception, another exception occurred:\n",
"\n",
"Traceback (most recent call last):\n",
" File \"/opt/conda/envs/Python36/lib/python3.6/site-packages/googleapiclient/discovery_cache/__init__.py\", line 42, in autodetect\n",
" from . import file_cache\n",
" File \"/opt/conda/envs/Python36/lib/python3.6/site-packages/googleapiclient/discovery_cache/file_cache.py\", line 41, in <module>\n",
" \"file_cache is unavailable when using oauth2client >= 4.0.0 or google-auth\"\n",
"ImportError: file_cache is unavailable when using oauth2client >= 4.0.0 or google-auth\n"
]
}
],
"source": [
"dic_search = google_search(\"itaú\", my_api_key, my_cse_id, num=10)\n", | |
"df_search = pd.DataFrame(list(dic_search.items())[5][1])" | |
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"import types\n",
"import pandas as pd\n",
"from botocore.client import Config\n",
"import ibm_boto3\n",
"\n",
"def __iter__(self): return 0\n",
"\n",
"# @hidden_cell\n",
"# The following code accesses a file in your IBM Cloud Object Storage. It includes your credentials.\n",
"# You might want to remove those credentials before you share the notebook.\n",
"client_09d659ed064a4c049d2d1b42b7c7853b = ibm_boto3.client(service_name='s3',\n", | |
" ibm_api_key_id='********************************_*',\n", | |
" ibm_auth_endpoint=\"https://iam.cloud.ibm.com/oidc/token\",\n", | |
" config=Config(signature_version='oauth'),\n", | |
" endpoint_url='https://s3-api.us-geo.objectstorage.service.networklayer.com')\n", | |
"\n", | |
"body = client_*************************.get_object(Bucket='*********************-donotdelete-**-***********',Key='feature_news.csv')['Body']\n", | |
"# add missing __iter__ method, so pandas accepts body as file-like object\n", | |
"if not hasattr(body, \"__iter__\"): body.__iter__ = types.MethodType( __iter__, body )\n", | |
"\n", | |
"feature_news = pd.read_csv(body)\n", | |
"filter_link = df_search[~df_search['link'].isin(feature_news.Link)].reset_index(drop=True)\n" | |
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"## remove the quotes from the news\n", | |
"filter_itau_news = pd.DataFrame()\n", | |
"for i,x in enumerate(~filter_link['link'].str.contains('cotacoes')): ### removing the quotation page information\n", | |
" if x:\n", | |
" filter_itau_news= filter_itau_news.append(df_search[i:i+1]) \n", | |
"filter_itau_news = filter_itau_news[['link','snippet','title','pagemap']] ## filter cols in link, snippet and title\n", | |
"filter_itau_news = filter_itau_news.reset_index()\n", | |
"del filter_itau_news['index']" | |
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"import requests\n", | |
"itau_news = pd.DataFrame(columns= ['Date','Link','Text'])\n", | |
"for i,x in enumerate(filter_itau_news['link']):\n", | |
" res = requests.get(x)\n", | |
" response = TextResponse(res.url, body=res.text, encoding='utf-8')\n", | |
" lst = pd.DataFrame([[filter_itau_news['pagemap'][i]['metatags'][0]['article:published_time'][:10],filter_itau_news['link'][i], response.css(\".article-content p::text\").extract()]], columns= ['Date','Link','Text'])\n", | |
" itau_news = itau_news.append(lst)" | |
]
},
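{
"cell_type": "markdown",
"metadata": {},
"source": [
"Pages that lack the `article:published_time` metatag, or that use a different article container than `.article-content`, would raise a `KeyError` in the loop above. A defensive variant under the same selector assumptions (`scrape_article` is an illustrative helper, not part of the original notebook):"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def scrape_article(row):\n",
"    res = requests.get(row['link'])\n",
"    response = TextResponse(res.url, body=res.text, encoding='utf-8')\n",
"    metatags = row['pagemap'].get('metatags', [{}])[0]\n",
"    date = metatags.get('article:published_time', '')[:10]  ## empty string when the tag is missing\n",
"    text = response.css(\".article-content p::text\").extract()\n",
"    return pd.Series({'Date': date, 'Link': row['link'], 'Text': text})"
]
},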
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"[nltk_data] Downloading package punkt to /home/dsxuser/nltk_data...\n",
"[nltk_data] Package punkt is already up-to-date!\n",
"[nltk_data] Downloading package stopwords to\n",
"[nltk_data] /home/dsxuser/nltk_data...\n",
"[nltk_data] Package stopwords is already up-to-date!\n"
]
}
],
"source": [
"import nltk\n", | |
"nltk.download('punkt')\n", | |
"nltk.download('stopwords')\n", | |
"def strip_accents(sentence):\n", | |
" try:\n", | |
" sentence = unicode(sentence, 'utf-8')\n", | |
" except NameError: # unicode is a default on python 3 \n", | |
" pass\n", | |
"\n", | |
" sentence = unicodedata.normalize('NFD', sentence)\\\n", | |
" .encode('ascii', 'ignore')\\\n", | |
" .decode(\"utf-8\")\n", | |
" return str(sentence)\n", | |
"\n", | |
"def remove_punction(text):\n", | |
" no_punct = \"\".join(c for c in text if c not in string.punctuation)\n", | |
" return no_punct\n", | |
"\n", | |
"def remove_number(text):\n", | |
" no_number = ''.join([i for i in text if not i.isdigit()])\n", | |
" return no_number\n", | |
"\n", | |
"def lower_caser(text):\n", | |
" return text.lower()\n", | |
"\n", | |
"def remove_stopwords(text):\n", | |
" tokens = word_tokenize(text)\n", | |
" tokens = [t for t in tokens if not t in stopwords.words('portuguese')]\n", | |
" return \" \".join(map(str.strip, tokens))" | |
]
},
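{
"cell_type": "markdown",
"metadata": {},
"source": [
"A quick sanity check of the cleaning chain on a made-up headline (the sentence is illustrative only), applied in the same order used on the scraped news below:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"sample = \"O Itaú lucrou 6,8 bilhões no trimestre de 2020!\"\n",
"for step in (remove_number, remove_stopwords, lower_caser, strip_accents, remove_punction):\n",
"    sample = step(sample)\n",
"print(sample)  ## note: stopwords are removed before lower-casing, so capitalized stopwords like 'O' survive"
]
},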
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [],
"source": [
"itau_news['Text'] = itau_news['Text'].apply(remove_number)\n",
"itau_news['Text'] = itau_news['Text'].apply(remove_stopwords)\n",
"itau_news['Text'] = itau_news['Text'].apply(lower_caser)\n",
"itau_news['Text'] = itau_news['Text'].apply(strip_accents)\n",
"itau_news['Text'] = itau_news['Text'].apply(remove_punction)"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/opt/conda/envs/Python36/lib/python3.6/site-packages/tensorflow/python/framework/indexed_slices.py:434: UserWarning: Converting sparse IndexedSlices to a dense Tensor of unknown shape. This may consume a large amount of memory.\n",
"  \"Converting sparse IndexedSlices to a dense Tensor of unknown shape. \"\n",
"/opt/conda/envs/Python36/lib/python3.6/site-packages/tensorflow/python/framework/indexed_slices.py:434: UserWarning: Converting sparse IndexedSlices to a dense Tensor of unknown shape. This may consume a large amount of memory.\n",
"  \"Converting sparse IndexedSlices to a dense Tensor of unknown shape. \"\n"
]
}
],
"source": [
"from io import BytesIO\n",
"model = load_model(project.get_file('RNNModel.tflearn'))\n",
"train_le = preprocessing.LabelEncoder()  # creating a label encoder\n"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [],
"source": [
"# Your data file was loaded into a botocore.response.StreamingBody object.\n",
"# Please read the documentation of ibm_boto3 and pandas to learn more about the possibilities to load the data.\n",
"# ibm_boto3 documentation: https://ibm.github.io/ibm-cos-sdk-python/\n",
"# pandas documentation: http://pandas.pydata.org/\n",
"streaming_body_2 = client_*****************************.get_object(Bucket='*********************-donotdelete-**-***********', Key='label_encoder.pkl')['Body']\n", | |
"# add missing __iter__ method so pandas accepts body as file-like object\n", | |
"if not hasattr(streaming_body_2, \"__iter__\"): streaming_body_2.__iter__ = types.MethodType( __iter__, streaming_body_2 ) \n" | |
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/opt/conda/envs/Python36/lib/python3.6/site-packages/sklearn/utils/deprecation.py:144: FutureWarning: The sklearn.preprocessing.label module is deprecated in version 0.22 and will be removed in version 0.24. The corresponding classes / functions should instead be imported from sklearn.preprocessing. Anything that cannot be imported from sklearn.preprocessing is now part of the private API.\n",
"  warnings.warn(message, FutureWarning)\n",
"/opt/conda/envs/Python36/lib/python3.6/site-packages/sklearn/externals/joblib/__init__.py:15: FutureWarning: sklearn.externals.joblib is deprecated in 0.21 and will be removed in 0.23. Please import this functionality directly from joblib, which can be installed with: pip install joblib. If this warning is raised when loading pickled models, you may need to re-serialize those models with scikit-learn 0.21+.\n",
"  warnings.warn(msg, category=FutureWarning)\n"
]
},
{
"data": {
"text/plain": [
"LabelEncoder()"
]
},
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from io import BytesIO\n",
"readrawdata = streaming_body_2.read()\n",
"label_encoder = pickle.load(BytesIO(readrawdata))\n",
"train_le.fit(label_encoder)"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [],
"source": [
"def RNN(Text):\n", | |
" test_texts = [Text]\n", | |
" tokenizer = Tokenizer(num_words=n_words)\n", | |
" tokenizer.fit_on_texts(test_texts)\n", | |
" test_sequences = tokenizer.texts_to_sequences(test_texts)\n", | |
" test_input = pad_sequences(test_sequences, maxlen=DIM_GLOVE)\n", | |
" test_predictions_probas = model.predict(test_input)\n", | |
" test_predictions = test_predictions_probas.argmax(axis=-1)\n", | |
" test_intent_predictions = train_le.inverse_transform(test_predictions)\n", | |
" df_probas = pd.DataFrame(test_predictions_probas)\n", | |
" df_intent = pd.DataFrame(test_intent_predictions)\n", | |
" frames = [df_probas, df_intent]\n", | |
" result = pd.concat(frames)\n", | |
" result = pd.concat([df_probas, df_intent], axis=1)\n", | |
" x = result.transpose()[0:104] \n", | |
" x.columns = ['probability']\n", | |
" x = pd.to_numeric(x['probability'][:3])\n", | |
" x = pd.DataFrame(x)\n", | |
" labels = pd.DataFrame(label_encoder)\n", | |
" labels = labels.reset_index()\n", | |
" labels.columns = ['Labels','number']\n", | |
" result_table= labels.join(x)\n", | |
" result_table = result_table.sort_values(by=['probability'], ascending = False)\n", | |
" result_table.head(5)\n", | |
" return result_table.head(5).reset_index(drop = True)['number'][0]" | |
]
},
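{
"cell_type": "markdown",
"metadata": {},
"source": [
"Fitting a fresh `Tokenizer` on each single news item yields word indices local to that item, which need not match the vocabulary the network was trained on. A minimal sketch of the alternative, assuming a hypothetical `tokenizer.pkl` pickled at training time (no such artifact is loaded in this notebook):"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"## Hypothetical: reuse the tokenizer fitted on the training corpus.\n",
"# with open('tokenizer.pkl', 'rb') as f:\n",
"#     train_tokenizer = pickle.load(f)\n",
"\n",
"def rnn_predict(text, fitted_tokenizer):\n",
"    seq = fitted_tokenizer.texts_to_sequences([text])  ## indices consistent with training\n",
"    x = pad_sequences(seq, maxlen=DIM_GLOVE)\n",
"    probas = model.predict(x)\n",
"    return train_le.inverse_transform(probas.argmax(axis=-1))[0]"
]
},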
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [],
"source": [
"itau_news['Class'] = itau_news['Text'].apply(RNN)"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [],
"source": [
"feature_news = feature_news.append(itau_news[['Date','Link','Class']]).reset_index(drop=True)"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"ename": "NameError",
"evalue": "name 'project' is not defined",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m<ipython-input-1-91a2b3e38107>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0;31m## Overwrite the csv\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0mproject\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msave_data\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdata\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mfeature_news\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mto_csv\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0mfile_name\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m'itau_news.csv'\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0moverwrite\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mTrue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
"\u001b[0;31mNameError\u001b[0m: name 'project' is not defined"
]
}
],
"source": [ | |
"## Overwrite the csv\n", | |
"project.save_data(data=feature_news.to_csv(),file_name='itau_news.csv',overwrite=True)" | |
] | |
},
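{
"cell_type": "markdown",
"metadata": {},
"source": [
"If the `project` object is unavailable (as in the recorded `NameError`), the COS client created earlier can write the file directly. A sketch reusing the masked bucket placeholder from above:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"## Alternative save path: write straight to Cloud Object Storage.\n",
"client_09d659ed064a4c049d2d1b42b7c7853b.put_object(\n",
"    Bucket='*********************-donotdelete-**-***********',\n",
"    Key='itau_news.csv',\n",
"    Body=feature_news.to_csv(index=False).encode('utf-8'))"
]
},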
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.3"
}
},
"nbformat": 4,
"nbformat_minor": 4
}