import pyspark

with pyspark.SparkContext("local", "PySparkWordCount") as sc:
    file_path = "example.csv"
    # Read the file into an RDD and cache it in memory for reuse
    data = sc.textFile(file_path).cache()
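    # Word-count sketch to match the app name (an assumption: example.csv holds
    # whitespace-separated text; split on "," instead for per-field counts)
    counts = (data.flatMap(lambda line: line.split())
                  .map(lambda word: (word, 1))
                  .reduceByKey(lambda a, b: a + b))
    print(counts.take(10))  # first 10 (word, count) pairs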
# Documentation:
# https://conda.io/docs/user-guide/tasks/manage-environments.html
# List the existing environments
conda info --envs
conda env list
# Create an environment with a specific Python version
conda create -n myenv python=3.4
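# Once created, the environment is typically activated before use; a sketch of
# the usual follow-up commands (older conda versions use `source activate myenv`):
conda activate myenv
python --version    # should report Python 3.4.x
conda deactivate    # leave the environment when done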
# Reference: http://docs.aws.amazon.com/glue/latest/dg/aws-glue-api-crawler-pyspark-extensions-python-intro.html
# Install or upgrade the AWS CLI for the current user
pip install awscli --upgrade --user
source ~/.bash_profile
aws --version
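# The CLI needs credentials before it can talk to AWS; the standard interactive
# setup plus a quick smoke test (assumes your key can list S3 buckets):
aws configure
aws s3 ls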
# List the file in Databricks
%fs ls dbfs:/databricks-datasets/online_retail/data-001/
# Databricks already initializes the SparkSession object, but otherwise it can be created like this
from pyspark.sql import SparkSession
spark = (SparkSession.builder
         .master("local")
         .appName("Word Count")
         .config("spark.some.config.option", "some-value")
         .getOrCreate())
# Read the CSV into a DataFrame (spark.read returns a DataFrame, not an RDD)
path = "dbfs:/databricks-datasets/online_retail/data-001/data.csv"
data = spark.read.csv(path)
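# By default spark.read.csv treats every column as a string and the first row
# as data; a sketch using the standard header/inferSchema reader options:
data = spark.read.csv(path, header=True, inferSchema=True)
data.printSchema()   # inspect the inferred column types
data.show(5)         # preview the first rows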
Note that the file can also be read with OpenCSVSerde (e.g. from Athena): http://docs.aws.amazon.com/athena/latest/ug/csv.html
# Show hidden files in Finder
defaults write com.apple.finder AppleShowAllFiles YES
killall Finder
# Hide them again
defaults write com.apple.finder AppleShowAllFiles NO
killall Finder
These steps install Dremio on Ubuntu.
### First, install the JRE
sudo apt-get update
sudo apt-get install default-jre
# Installation links
# https://www.digitalocean.com/community/tutorials/instalar-java-en-ubuntu-con-apt-get-es
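# The notes stop after the JRE; a sketch of the remaining steps, assuming the
# Community Edition .deb was already downloaded from dremio.com (the file name
# below is a placeholder):
java -version                       # verify the JRE installed correctly
sudo dpkg -i dremio-community.deb   # placeholder file name
sudo service dremio start
sudo service dremio status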
# Disable the firewall
sudo ufw status verbose
sudo ufw disable
# Install Apache
sudo apt-get install apache2
# Search the available mysql packages
sudo apt-cache search mysql | grep mysql | more
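# After finding the package name, the install usually looks like this
# (mysql-server is the standard Ubuntu metapackage; the checks are a sketch):
sudo apt-get install mysql-server
sudo systemctl status apache2
sudo systemctl status mysql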