# Create Directories
# Creates the HDFS storage directories. $baseDir is read again later in the
# script when the name-node and data-node directories are written to
# hdfs-site.xml, so its name and value are left unchanged.
$baseDir = "C:/hadoop/hadoop-3.2.2/data"

# -Force is applied to every directory, including the base directory: without
# it New-Item reports an error when the directory already exists, which made a
# second run of the script fail on its very first step.
foreach ($directory in @($baseDir, "$baseDir/dfs/data", "$baseDir/dfs/namespace_logs")) {
    New-Item -ItemType Directory -Path $directory -Force
}

# Fix JAVA_HOME
# Rewrites the "set JAVA_HOME=" line of hadoop-env.cmd so that it holds the
# short path (ShortPath, as reported by Scripting.FileSystemObject) of the
# installed JDK 8 folder.

$javaRoot = "C:\Progra~1\Java\"

# Kept from the original script so the working directory after this step is
# the same as before.
Set-Location $javaRoot

# "jdk1.8.0_*" is a wildcard, so it is matched with -like. The original used
# -match, which treats the text as a regular expression ('.' matches any
# character and '_*' means "zero or more underscores").
# When several JDK 8 updates are installed the one with the highest update
# number is chosen; the original passed all of them to GetFolder at once.
$javaFolder = Get-ChildItem -Path $javaRoot |
    Where-Object { $_.PSIsContainer -and $_.Name -like "jdk1.8.0_*" } |
    Sort-Object -Property { if ($_.Name -match '_(\d+)') { [int]$Matches[1] } else { 0 } } |
    Select-Object -Last 1

if ($null -eq $javaFolder) {
    throw "No jdk1.8.0_* folder was found under $javaRoot"
}

$fileSystem = New-Object -ComObject Scripting.FileSystemObject
$javaPath = $fileSystem.GetFolder($javaFolder.FullName).ShortPath

$hadoopEnvPath = "C:\hadoop\hadoop-3.2.2\etc\hadoop\hadoop-env.cmd"

# The parentheses make Get-Content read the whole file before Set-Content
# opens the same file for writing.
(Get-Content $hadoopEnvPath) | ForEach-Object {
    if ($_ -like 'set JAVA_HOME=*') {
        # Emit the new line directly instead of going through -replace, so a
        # '$' in the path can never be taken as a regex substitution token.
        "set JAVA_HOME=$javaPath"
    }
    else {
        $_
    }
} | Set-Content $hadoopEnvPath

# Create environment variables
# Persists HADOOP_HOME, HADOOP_CP and HDFS_LOC for the current user and puts
# Hadoop's bin and sbin directories at the front of the user's Path.

$hadoopHome = "C:\hadoop\hadoop-3.2.2"
[Environment]::SetEnvironmentVariable("HADOOP_HOME", $hadoopHome, "User")

# Writing with the "User" target stores the value for future processes only;
# it does not change the environment of the running process. The original then
# read $env:HADOOP_HOME, which is empty on a first run, so hadoop.cmd was not
# found and "\bin;\sbin" was added to Path. Set the process variable too and
# build every path below from $hadoopHome.
$env:HADOOP_HOME = $hadoopHome

$hadoopClasspath = & "$hadoopHome\bin\hadoop.cmd" classpath
[Environment]::SetEnvironmentVariable("HADOOP_CP", $hadoopClasspath, "User")

[Environment]::SetEnvironmentVariable("HDFS_LOC", "hdfs://localhost:19000", "User")

$path = [Environment]::GetEnvironmentVariable("Path", "User")

$currentEntries = @()
if ($path) {
    $currentEntries = @($path -split ';')
}

# Only add the entries that are not already present, so running the script
# again does not keep growing the user's Path.
$newPaths = @("$hadoopHome\bin", "$hadoopHome\sbin") |
    Where-Object { $currentEntries -notcontains $_ }

if ($newPaths) {
    $pathParts = @($newPaths)
    if ($path) {
        # Skipped when the user has no Path yet, which avoids a trailing ';'.
        $pathParts += $path
    }
    $newPath = $pathParts -join ';'
    [Environment]::SetEnvironmentVariable("Path", $newPath, "User")
}

# Hadoop site configuration: core-site.xml, hdfs-site.xml, mapred-site.xml and
# yarn-site.xml each receive a few <property> entries under <configuration>.

# Sets one <property><name>..</name><value>..</value></property> entry in a
# Hadoop site file and saves the file.
#   -Path   full path of the XML file (XmlDocument.Load/Save are .NET calls;
#           use an absolute path, as every caller below does)
#   -Name   property name, compared case-sensitively
#   -Value  text stored in the property's <value> element
# If a property with that name already exists its value is updated (the last
# match when there are several); otherwise a new property is appended. Running
# the script again therefore no longer adds duplicate entries.
# Throws when the file has no <configuration> root element.
function Set-HadoopSiteProperty {
    param(
        [Parameter(Mandatory = $true)][string]$Path,
        [Parameter(Mandatory = $true)][string]$Name,
        [Parameter(Mandatory = $true)][string]$Value
    )

    $document = New-Object System.Xml.XmlDocument
    $document.Load($Path)

    # SelectSingleNode always yields an XmlNode (or $null). The original used
    # $xml.configuration, which PowerShell's XML adapter is understood to
    # return as a plain string when the element has no children, and a string
    # has no AppendChild method.
    $configuration = $document.SelectSingleNode('/configuration')
    if ($null -eq $configuration) {
        throw "No <configuration> root element found in $Path"
    }

    $property = $null
    foreach ($candidate in $configuration.SelectNodes('property')) {
        $candidateName = $candidate.SelectSingleNode('name')
        if ($null -ne $candidateName -and $candidateName.InnerText.Trim() -ceq $Name) {
            $property = $candidate
        }
    }

    # [void] discards the node AppendChild returns; the original let every
    # appended node spill into the script's output.
    if ($null -eq $property) {
        $property = $document.CreateElement('property')
        $nameElement = $document.CreateElement('name')
        $nameElement.InnerText = $Name
        [void]$property.AppendChild($nameElement)
        [void]$configuration.AppendChild($property)
    }

    $valueElement = $property.SelectSingleNode('value')
    if ($null -eq $valueElement) {
        $valueElement = $document.CreateElement('value')
        [void]$property.AppendChild($valueElement)
    }
    $valueElement.InnerText = $Value

    $document.Save($Path)
}

$hadoopConfDir = "C:\hadoop\hadoop-3.2.2\etc\hadoop"

# 1. core-site.xml
$path = "$hadoopConfDir\core-site.xml"
Set-HadoopSiteProperty -Path $path -Name "fs.default.name" -Value "hdfs://0.0.0.0:19000"

# 2. hdfs-site.xml
# $baseDir is the data directory assigned in the "Create Directories" step at
# the top of the script.
$path = "$hadoopConfDir\hdfs-site.xml"
Set-HadoopSiteProperty -Path $path -Name "dfs.replication" -Value "1"
Set-HadoopSiteProperty -Path $path -Name "dfs.namenode.name.dir" -Value "$baseDir/dfs/namespace_logs"
Set-HadoopSiteProperty -Path $path -Name "dfs.datanode.data.dir" -Value "$baseDir/dfs/data"

# 3. mapred-site.xml
$path = "$hadoopConfDir\mapred-site.xml"
Set-HadoopSiteProperty -Path $path -Name "mapreduce.framework.name" -Value "yarn"

# Single-quoted so %HADOOP_HOME% and '*' are written to the file literally;
# joined with ',' this is the same value the original wrote as one long string.
$mapreduceClasspath = @(
    '%HADOOP_HOME%/share/hadoop/mapreduce/*',
    '%HADOOP_HOME%/share/hadoop/mapreduce/lib/*',
    '%HADOOP_HOME%/share/hadoop/common/*',
    '%HADOOP_HOME%/share/hadoop/common/lib/*',
    '%HADOOP_HOME%/share/hadoop/yarn/*',
    '%HADOOP_HOME%/share/hadoop/yarn/lib/*',
    '%HADOOP_HOME%/share/hadoop/hdfs/*',
    '%HADOOP_HOME%/share/hadoop/hdfs/lib/*'
) -join ','
Set-HadoopSiteProperty -Path $path -Name "mapreduce.application.classpath" -Value $mapreduceClasspath

# 4. yarn-site.xml
$path = "$hadoopConfDir\yarn-site.xml"
Set-HadoopSiteProperty -Path $path -Name "yarn.resourcemanager.hostname" -Value "localhost"
Set-HadoopSiteProperty -Path $path -Name "yarn.nodemanager.aux-services" -Value "mapreduce_shuffle"
Set-HadoopSiteProperty -Path $path -Name "yarn.nodemanager.env-whitelist" -Value "JAVA_HOME,HADOOP_COMMON_HOME,HADOOP_HDFS_HOME,HADOOP_CONF_DIR,CLASSPATH_PREPEND_DISTCACHE,HADOOP_YARN_HOME,HADOOP_MAPRED_HOME"

# Create Files
# Destinations of the three files written by the steps that follow: a sample
# text file plus the mapper and reducer scripts.
$sampleFolder = "C:\hadoop"

$testFilePath = "$sampleFolder\test.txt"
$mapperFilePath = "$sampleFolder\mapper.py"
$reducerFilePath = "$sampleFolder\reducer.py"

# 1. Create test.txt file
# Writes the one-line sample text to $testFilePath, replacing any earlier copy.
# A single-quoted here-string is used because the text needs no expansion.
$testFileText = @'
this is the test file if you are seeing it in HDFS test folder it means the file uploaded successfully. If you are seeing it in output folder it means the streaming job ran successfully
'@

Set-Content -Path $testFilePath -Value $testFileText -Force

# 2. Create mapper.py file
# Writes a Python script to $mapperFilePath that reads lines from stdin and
# prints "<word><TAB>1" for every whitespace-separated word.
# The Python body is indented here: Python uses indentation to delimit the
# loop bodies, and the un-indented text could not be run.
# The here-string is single-quoted so PowerShell writes the text verbatim.
Set-Content -Path $mapperFilePath -Value @'
import sys

for line in sys.stdin:
    line = line.strip()
    words = line.split()
    for word in words:
        print('%s\t%s' % (word, 1))
'@ -Force

# 3. Create reducer.py file
# Writes a Python script to $reducerFilePath that reads "<word><TAB><count>"
# lines from stdin and prints one "<word><TAB><total>" line per run of
# consecutive identical words.
# Changes to the embedded Python compared with the original text:
#   * indentation restored (the un-indented text could not be run);
#   * the split is inside the try block, so a line without a tab is skipped
#     instead of stopping the script with an unpacking error;
#   * the final total is printed whenever a word has been seen. The original
#     test "current_word == word" printed "None<TAB>0" for empty input and
#     skipped the last total when the final input line was malformed;
#   * the unused "from operator import itemgetter" import is removed.
# The here-string is single-quoted so PowerShell writes the text verbatim.
Set-Content -Path $reducerFilePath -Value @'
import sys

current_word = None
current_count = 0
word = None

for line in sys.stdin:
    line = line.strip()
    try:
        word, count = line.split('\t', 1)
        count = int(count)
    except ValueError:
        continue
    if current_word == word:
        current_count += count
    else:
        if current_word:
            print('%s\t%s' % (current_word, current_count))
        current_count = count
        current_word = word

if current_word is not None:
    print('%s\t%s' % (current_word, current_count))
'@ -Force