##############################################################
Email : [email protected]
##############################################################
-
Enable WSL on Windows: https://www.makeuseof.com/enable-windows-subsystem-for-linux/
-
Install Ubuntu 20.04 from Microsoft Store
-
Set up the user: once Ubuntu is installed, it prompts you to create a username and password
-
Execute the following commands: a. sudo apt update
b. apt list --upgradable
c. sudo apt install default-jdk
d. java -version
e. sudo adduser hadoopuser   (username/password used in this guide: hadoopuser/hadoopuser)
f. Setup SSH To get the ssh server working properly, you must uninstall and then reinstall it using the following command:
sudo apt remove openssh-server
sudo apt install openssh-server
sudo vi /etc/ssh/sshd_config
- Change PasswordAuthentication to yes
- Add your login user to the bottom of the file by using this command: AllowUsers hadoopuser
Check the status of the ssh service:
service ssh status
If you see: * sshd is not running
Then run this command:
sudo service ssh start
If you see: * sshd is running
Then run this command:
sudo service ssh --full-restart
g. Switch user to hadoopuser: su - hadoopuser
h. Enabling Passwordless SSH for a Hadoop User ssh-keygen -t rsa -P '' -f ~/.ssh/id_rsa
cat ~/.ssh/id_rsa.pub >> ~/.ssh/authorized_keys
chmod 0600 ~/.ssh/authorized_keys
ssh localhost
i. Set JAVA_HOME readlink -f /usr/bin/javac
vi .bashrc
# Add this line/output of above command as JAVA_HOME
export JAVA_HOME=/usr/lib/jvm/java-11-openjdk-amd64
j. Download & setup Hadoop wget https://archive.apache.org/dist/hadoop/common/hadoop-3.2.3/hadoop-3.2.3.tar.gz
tar xzf hadoop-3.2.3.tar.gz
k. Add User to the Sudoers Group # Switch to root user sudo usermod -aG sudo hadoopuser
l. Switch back to hadoopuser su - hadoopuser
whoami
m. Configure Hadoop Environment
A) ~/.bashrc
# Hadoop Variables
export HADOOP_HOME=/home/hadoopuser/hadoop-3.2.3
export HADOOP_INSTALL=$HADOOP_HOME
export HADOOP_MAPRED_HOME=$HADOOP_HOME
export HADOOP_COMMON_HOME=$HADOOP_HOME
export HADOOP_HDFS_HOME=$HADOOP_HOME
export YARN_HOME=$HADOOP_HOME
export HADOOP_COMMON_LIB_NATIVE_DIR=$HADOOP_HOME/lib/native
export PATH=$PATH:$HADOOP_HOME/sbin:$HADOOP_HOME/bin
export HADOOP_OPTS="-Djava.library.path=$HADOOP_HOME/lib/native"
export JAVA_HOME=/usr/lib/jvm/java-11-openjdk-amd64
B) hadoop-env.sh
sudo vi $HADOOP_HOME/etc/hadoop/hadoop-env.sh
export JAVA_HOME=/usr/lib/jvm/java-11-openjdk-amd64
C) core-site.xml
sudo vi $HADOOP_HOME/etc/hadoop/core-site.xml
<configuration>
<property>
<name>hadoop.tmp.dir</name>
<value>/home/hadoop/tmpdata</value>
</property>
<property>
<name>fs.default.name</name>
<value>hdfs://127.0.0.1:9000</value>
</property>
</configuration>
D) hdfs-site.xml
sudo vi $HADOOP_HOME/etc/hadoop/hdfs-site.xml
<configuration>
<property>
<name>dfs.name.dir</name>
<value>/home/hadoopuser/dfsdata/namenode</value>
</property>
<property>
<name>dfs.data.dir</name>
<value>/home/hadoopuser/dfsdata/datanode</value>
</property>
<property>
<name>dfs.replication</name>
<value>1</value>
</property>
</configuration>
E) mapred-site.xml
sudo vi $HADOOP_HOME/etc/hadoop/mapred-site.xml
<configuration>
<property>
<name>mapreduce.framework.name</name>
<value>yarn</value>
</property>
</configuration>
F) yarn-site.xml
sudo vi $HADOOP_HOME/etc/hadoop/yarn-site.xml
<configuration>
<property>
<name>yarn.nodemanager.aux-services</name>
<value>mapreduce_shuffle</value>
</property>
<property>
<name>yarn.nodemanager.aux-services.mapreduce_shuffle.class</name>
<value>org.apache.hadoop.mapred.ShuffleHandler</value>
</property>
<property>
<name>yarn.resourcemanager.hostname</name>
<value>127.0.0.1</value>
</property>
<property>
<name>yarn.acl.enable</name>
<value>0</value>
</property>
<property>
<name>yarn.nodemanager.env-whitelist</name>
<value>JAVA_HOME,HADOOP_COMMON_HOME,HADOOP_HDFS_HOME,HADOOP_CONF_DIR,CLASSPATH_PREPEND_DISTCACHE,HADOOP_YARN_HOME,HADOOP_MAPRED_HOME</value>
</property>
</configuration>
n. Create dir sudo mkdir -p /home/hadoop/tmpdata
sudo chmod 777 /home/hadoop/tmpdata
- Format HDFS hdfs namenode -format
o. Start HDFS cd $HADOOP_HOME/sbin/
./start-dfs.sh
./start-yarn.sh
Check that HDFS & YARN are running with the command: jps
hadoopuser@Ira-Laptop:~/hadoop-3.2.3/sbin$ jps
3633 NameNode
4309 NodeManager
3733 DataNode
3881 SecondaryNameNode
4665 Jps
4202 ResourceManager
hadoopuser@Ira-Laptop:~/hadoop-3.2.3/sbin$
# Check on URL
http://localhost:9870/dfshealth.html#tab-overview
p. SPARK setup wget https://archive.apache.org/dist/spark/spark-3.0.1/spark-3.0.1-bin-hadoop3.2.tgz
tar xzf spark-3.0.1-bin-hadoop3.2.tgz
Add below in .bashrc
# Spark Setup
export SPARK_HOME=/home/hadoopuser/spark-3.0.1-bin-hadoop3.2
export PATH=$PATH:$SPARK_HOME/bin:$SPARK_HOME/sbin
export PYTHONPATH=$PYTHONPATH:/usr/bin/python3
export PYSPARK_PYTHON=python3
q. Setup PySpark notebook integrating with HDFS on WSL ubuntu - Exit from the hadoopuser
sudo apt install python3-pip
- Install below python packages as hadoopuser
sudo apt install jupyter-core
pip install jupyter
pip install pyspark==3.3.1
pip install pandas==1.5.3 # Newer pandas versions are not compatible: they no longer provide iteritems
pip install findspark
pip install fsspec
pip install pyarrow
pip install openpyxl
- Add below in the .bashrc file of root user [not hadoopuser]
alias jupyter-lab="/home/yogesh/.local/bin/jupyter-lab"
- Add below in the .bashrc file of hadoopuser
export CLASSPATH=`$HADOOP_HOME/bin/hdfs classpath --glob`
r. Launch jupyter notebook from the WSL command prompt hadoopuser@Ira-Laptop:~$ jupyter-lab --no-browser
This will give the log with the URL. Copy the URL in the browser
s. Here are the sample commands to copy files from host machine to HDFS:
hadoopuser@Ira-Laptop:/mnt/c/bds$ hdfs dfs -copyFromLocal taxi_zone_lookup.csv /bds/
hadoopuser@Ira-Laptop:/mnt/c/bds$ hdfs dfs -copyFromLocal yellow_tripdata_2020-06.xlsx /bds/
t. Here are the Python packages installed as hadoopuser
hadoopuser@Ira-Laptop:$ pip freeze
anyio==3.7.1
argon2-cffi==23.1.0
argon2-cffi-bindings==21.2.0
arrow==1.2.3
asttokens==2.2.1
async-lru==2.0.4
attrs==23.1.0
Babel==2.12.1
backcall==0.2.0
beautifulsoup4==4.12.2
bleach==6.0.0
blinker==1.4
certifi==2023.7.22
cffi==1.15.1
charset-normalizer==3.2.0
comm==0.1.4
command-not-found==0.3
cryptography==3.4.8
dbus-python==1.2.18
debugpy==1.6.7.post1
decorator==5.1.1
defusedxml==0.7.1
distro==1.7.0
distro-info===1.1build1
et-xmlfile==1.1.0
exceptiongroup==1.1.3
executing==1.2.0
fastjsonschema==2.18.0
findspark==2.0.1
fqdn==1.5.1
fsspec==2023.6.0
httplib2==0.20.2
idna==3.4
importlib-metadata==4.6.4
ipykernel==6.25.1
ipython==8.14.0
ipython-genutils==0.2.0
ipywidgets==8.1.0
isoduration==20.11.0
jedi==0.19.0
jeepney==0.7.1
Jinja2==3.1.2
json5==0.9.14
jsonpointer==2.4
jsonschema==4.19.0
jsonschema-specifications==2023.7.1
jupyter==1.0.0
jupyter-console==6.6.3
jupyter-events==0.7.0
jupyter-lsp==2.2.0
jupyter_client==8.3.0
jupyter_core==5.3.1
jupyter_server==2.7.2
jupyter_server_terminals==0.4.4
jupyterlab==4.0.5
jupyterlab-pygments==0.2.2
jupyterlab-widgets==3.0.8
jupyterlab_server==2.24.0
keyring==23.5.0
launchpadlib==1.10.16
lazr.restfulclient==0.14.4
lazr.uri==1.0.6
MarkupSafe==2.1.3
matplotlib-inline==0.1.6
mistune==3.0.1
more-itertools==8.10.0
nbclient==0.8.0
nbconvert==7.7.4
nbformat==5.9.2
nest-asyncio==1.5.7
netifaces==0.11.0
notebook==7.0.2
notebook_shim==0.2.3
numpy==1.25.2
oauthlib==3.2.0
openpyxl==3.1.2
overrides==7.4.0
packaging==23.1
pandas==1.5.3
pandocfilters==1.5.0
parso==0.8.3
pexpect==4.8.0
pickleshare==0.7.5
platformdirs==3.10.0
prometheus-client==0.17.1
prompt-toolkit==3.0.39
psutil==5.9.5
ptyprocess==0.7.0
pure-eval==0.2.2
py4j==0.10.9.5
pyarrow==13.0.0
pycparser==2.21
Pygments==2.16.1
PyGObject==3.42.1
PyJWT==2.3.0
pyparsing==2.4.7
pyspark==3.3.1
python-apt==2.4.0+ubuntu1
python-dateutil==2.8.2
python-json-logger==2.0.7
pytz==2023.3
PyYAML==5.4.1
pyzmq==25.1.1
qtconsole==5.4.3
QtPy==2.3.1
referencing==0.30.2
requests==2.31.0
rfc3339-validator==0.1.4
rfc3986-validator==0.1.1
rpds-py==0.9.2
SecretStorage==3.3.1
Send2Trash==1.8.2
six==1.16.0
sniffio==1.3.0
soupsieve==2.4.1
ssh-import-id==5.11
stack-data==0.6.2
systemd-python==234
terminado==0.17.1
tinycss2==1.2.1
tomli==2.0.1
tornado==6.3.3
traitlets==5.9.0
typing_extensions==4.7.1
tzdata==2023.3
ubuntu-advantage-tools==8001
ufw==0.36.1
unattended-upgrades==0.1
uri-template==1.3.0
urllib3==2.0.4
wadllib==1.3.6
wcwidth==0.2.6
webcolors==1.13
webencodings==0.5.1
websocket-client==1.6.2
widgetsnbextension==4.0.8
zipp==1.0.0
hadoopuser@Ira-Laptop:$