Linux Tutorial Step by Step Guide: August 2018

Cloudera Distribution Apache Hadoop single Node Installation Step by Step guide Centos 7. Completely manual process.

# yum install java

# java --version
openjdk version "1.8.0_181"
OpenJDK Runtime Environment (build 1.8.0_181-b13)
OpenJDK 64-Bit Server VM (build 25.181-b13, mixed mode)

# useradd -g hadoop hadoop

# passwd hadoop
Changing password for user hadoop.
New password:
BAD PASSWORD: The password contains the user name in some form
Retype new password:
passwd: all authentication tokens updated successfully.

# su - hadoop

$ ssh-keygen -t rsa
Generating public/private rsa key pair.
Enter file in which to save the key (/home/hadoop/.ssh/id_rsa):
Created directory '/home/hadoop/.ssh'.
Enter passphrase (empty for no passphrase):
Enter same passphrase again:
Your identification has been saved in /home/hadoop/.ssh/id_rsa.
Your public key has been saved in /home/hadoop/.ssh/id_rsa.pub.
The key fingerprint is:
SHA256:n4d58SIkG9mWM8P6QS8YjxCNxbkhaYKhCNJwFH8a3A8 hadoop@hyd-hyd-hadoop-test.arkit.co.in
The key's randomart image is:
+---[RSA 2048]----+
|++=+ o.. |
|+ooo..+++ |
|o +oE..o |
| + +.+ . |
| . . S @ . |
| . # O o |
| = X = . |
| . * . |
| . |
+----[SHA256]-----+

$ cat ~/.ssh/id_rsa.pub
ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAABAQDkk39QsAbOJwMVy7AenkzYg6nR9YDhVaqwUna9bR2Tu8XsHUYy+x9TiWtrj+3awAb8uuMHqV3Q+aW2Pe3FEwJvWRv0j3tqhYD9UgM4blMAEdHnLhj0bO+AT+y3yhxCcb+7+WEgv1B0pz9HaD8D4NDiYIizUktcetYT3SswS3vZxJrSDr5BsepnHQDMcAgF4xpZwXIbJaoNuQIlO93+L/2OiIqlsjb2BKiBf5EoNTFZt9lJVvCyiV29ujH2eb6zktyIn+D7Z/591Esk1zEcIwIjsjvTJFpb9HXYHJqQhAYC0VrHNRnzRRyGJguzGkFUK8ppTbI7P+W29KBDbpfg7T8z hadoop@hyd-hadoop-test.arkit.co.in

$ cd ~/.ssh/
$ ls
id_rsa id_rsa.pub
$ touch authorized_keys
$ vi authorized_keys
### Paste Above id_rsa.pub content ###
$ chmod 0600 authorized_keys

Downloaded Java fro m oracle site and installed
# yum localinstall jdk-8u172-linux-x64.rpm

Installed:
jdk1.8.x86_64 2000:1.8.0_172-fcs

Complete!

# export JAVA_HOME=/usr/java/jdk1.8.0_172-amd64/

# sudo yum clean all; sudo yum install hadoop-yarn-resourcemanager
# sudo yum clean all; sudo yum install hadoop-hdfs-namenode
# sudo yum clean all; sudo yum install hadoop-hdfs-secondarynamenode
# sudo yum clean all; sudo yum install hadoop-yarn-nodemanager hadoop-hdfs-datanode hadoop-mapreduce -y
# sudo yum clean all; sudo yum install hadoop-mapreduce-historyserver hadoop-yarn-proxyserver -y
# sudo yum clean all; sudo yum install hadoop-client

--Disable Firewall
systemctl stop firewalld/iptables/ip6tables
systemctl disable firewalld/iptables/ip6tables

service stop iptables
service stop ip6tables

chkconfig iptables off
chkconfig ip6tables off

--Disable SELinux 
Edit file /etc/selinux/config

SELINUX=disabled

Note: Reboot is required to take effect

root@hyd-hadoop-test:~# cp -r /etc/hadoop/conf.empty/ /etc/hadoop/conf.my_cluster

root@hyd-hadoop-test:~# sudo alternatives --install /etc/hadoop/conf hadoop-conf /etc/hadoop/conf.my_cluster 50

root@hyd-hadoop-test:~# sudo alternatives --set hadoop-conf /etc/hadoop/conf.my_cluster

root@hyd-hadoop-test:~# sudo alternatives --display hadoop-conf
hadoop-conf - status is manual.
link currently points to /etc/hadoop/conf.my_cluster
/etc/hadoop/conf.empty - priority 10
/etc/hadoop/conf.impala - priority 5
/etc/hadoop/conf.my_cluster - priority 50
Current `best' version is /etc/hadoop/conf.my_cluster.

Edit the configuration file and add
# cat /etc/hadoop/conf.my_cluster/core-site.xml

<configuration>
<property>
<name>fs.defaultFS</name>
<value>hdfs://hyd-hadoop-test:8020</value>
<description>NameNode URI</description>
</property>

<property>
<name>fs.trash.interval</name>
<value>30</value>
</property>

<property>
<name>fs.trash.checkpoint.interval</name>
<value>31</value>
</property>

</configuration>

# cat /etc/hadoop/conf.my_cluster/hdfs-site.xml
<configuration>
<property>
<name>dfs.namenode.name.dir</name>
<value>file:///data/hadoop-hdfs/cache/hdfs/dfs/name</value>
</property>

<property>
<name>dfs.datanode.name.dir</name>
<value>file:///data/hadoop-hdfs/cache/hdfs/dfs/data</value>
</property>

<property>
<name>dfs.permissions.superusergroup</name>
<value>hadoop</value>
</property>

<property>
<name>dfs.webhdfs.enabled</name>
<value>true</value>
</property>

</configuration>

root@hyd-hadoop-test:~# mkdir -p /data/hadoop-hdfs/cache/hdfs/dfs/name

root@hyd-hadoop-test:~# mkdir -p /data/hadoop-hdfs/cache/hdfs/dfs/data

root@hyd-hadoop-test:~# chown -R hdfs:hdfs /data/hadoop-hdfs/cache/hdfs/dfs/data/

root@hyd-hadoop-test:~# chown -R hdfs:hdfs /data/hadoop-hdfs/cache/hdfs/dfs/name/

root@hyd-hadoop-test:~# chmod 775 /data/hadoop-hdfs/cache/hdfs/dfs/name

root@hyd-hadoop-test:~# chmod 775 /data/hadoop-hdfs/cache/hdfs/dfs/data/

format namenode
# sudo -u hdfs hdfs namenode -format

18/08/14 12:53:13 INFO namenode.FSImage: Allocated new BlockPoolId: BP-90611475-10.103.2.104-1534231393025
18/08/14 12:53:13 INFO common.Storage: Storage directory /data/hadoop-hdfs/cache/hdfs/dfs/name has been successfully formatted.
18/08/14 12:53:13 INFO namenode.FSImageFormatProtobuf: Saving image file /data/hadoop-hdfs/cache/hdfs/dfs/name/current/fsimage.ckpt_0000000000000000000 using no compression
18/08/14 12:53:13 INFO namenode.FSImageFormatProtobuf: Image file /data/hadoop-hdfs/cache/hdfs/dfs/name/current/fsimage.ckpt_0000000000000000000 of size 317 bytes saved in 0 seconds.
18/08/14 12:53:13 INFO namenode.NNStorageRetentionManager: Going to retain 1 images with txid >= 0
18/08/14 12:53:13 INFO util.ExitUtil: Exiting with status 0
18/08/14 12:53:13 INFO namenode.NameNode: SHUTDOWN_MSG:
/************************************************************
SHUTDOWN_MSG: Shutting down NameNode at hyd-hadoop-test/192.168.2.5
************************************************************/

Start HDFS
root@hyd-hadoop-test:~# for x in `cd /etc/init.d ; ls hadoop-hdfs-*` ; do sudo service $x start ; done
starting datanode, logging to /var/log/hadoop-hdfs/hadoop-hdfs-datanode-hyd-hyd-hadoop-test.arkit.co.in.out
Started Hadoop datanode (hadoop-hdfs-datanode): [ OK ]
starting namenode, logging to /var/log/hadoop-hdfs/hadoop-hdfs-namenode-hyd-hyd-hadoop-test.arkit.co.in.out
Started Hadoop namenode: [ OK ]
starting secondarynamenode, logging to /var/log/hadoop-hdfs/hadoop-hdfs-secondarynamenode-hyd-hyd-hadoop-test.arkit.co.in.out
Started Hadoop secondarynamenode: [ OK ]

# sudo -u hdfs hadoop fs -mkdir /tmp

# sudo -u hdfs hadoop fs -chmod -R 1777 /tmp

sudo -u hdfs hadoop fs -ls /

root@hyd-hadoop-test:~# cat /etc/hadoop/conf.my_cluster/mapred-site.xml

<configuration>
<property>
<name>mapreduce.fremework.name</name>
<value>yarn</value>
</property>

<property>
<name>mapreduce.jobhistory.address</name>
<value>hyd-hadoop-test.arkit.co.in:10020</value>
</property>

<property>
<name>mapreduce.jobhistory.webapp.address</name>
<value>hyd-hadoop-test.arkit.co.in:19888</value>
</property>

</configuration>

root@hyd-hadoop-test:~# cat /etc/hadoop/conf.my_cluster/yarn-site.xml
<?xml version="1.0"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>

<configuration>
<property>
<name>yarn.nodemanager.aux-services</name>
<value>mapreduce_shuffle</value>
</property>

<property>
<name>yarn.nodemanager.aux-services.mapreduce_shuffle.class</name>
<value>org.apache.hadoop.mapred.ShuffleHandler</value>
</property>

<property>
<name>yarn.log-aggregation-enable</name>
<value>true</value>
</property>

<property>
<description>List of directories to store localized files in.</description>
<name>yarn.nodemanager.local-dirs</name>
<value>file:///var/lib/hadoop-yarn/cache/${user.name}/nm-local-dir</value>
</property>

<property>
<description>Where to store container logs.</description>
<name>yarn.nodemanager.log-dirs</name>
<value>file:///var/log/hadoop-yarn/containers</value>
</property>

<property>
<description>Where to aggregate logs to.</description>
<name>yarn.nodemanager.remote-app-log-dir</name>
<value>hdfs://var/log/hadoop-yarn/apps</value>
</property>

<property>
<description>Classpath for typical applications.</description>
<name>yarn.application.classpath</name>
<value>
$HADOOP_CONF_DIR,
$HADOOP_COMMON_HOME/*,$HADOOP_COMMON_HOME/lib/*,
$HADOOP_HDFS_HOME/*,$HADOOP_HDFS_HOME/lib/*,
$HADOOP_MAPRED_HOME/*,$HADOOP_MAPRED_HOME/lib/*,
$HADOOP_YARN_HOME/*,$HADOOP_YARN_HOME/lib/*
</value>
</property>

<property>
<name>yarn.nodemanager.aux-services</name>
<value>mapreduce_shuffle</value>
</property>
<property>
<name>yarn.resourcemanager.address</name>
<value>hyd-hadoop-test.arkit.co.in:8032</value>
</property>
<property>
<name>yarn.resourcemanager.scheduler.address</name>
<value>hyd-hadoop-test.arkit.co.in:8030</value>
</property>
<property>
<name>yarn.resourcemanager.resource-tracker.address</name>
<value>hyd-hadoop-test.arkit.co.in:8031</value>
</property>
<property>
<name>yarn.resourcemanager.admin.address</name>
<value>hyd-hadoop-test.arkit.co.in:8033</value>
</property>
<property>
<name>yarn.resourcemanager.webapp.adress</name>
<value>hyd-hadoop-test.arkit.co.in:8088</value>
</property>
<property>
<name>yarn.log.aggregation-enable</name>
<value>true</value>
</property>

</configuration>

Adding Environment Variables for Haddop component
root@hyd-hadoop-test:/usr/lib/hadoop-mapreduce# export HADOOP_COMMON_HOME=/usr/lib/hadoop

root@hyd-hadoop-test:/usr/lib/hadoop-mapreduce# export HADOOP_HDFS_HOME=/usr/lib/hadoop/hadoop-hdfs

root@hyd-hadoop-test:/usr/lib/hadoop-mapreduce# export HADOOP_MAPRED_HOME=/usr/lib/hadoop-mapreduce

root@hyd-hadoop-test:/usr/lib/hadoop-mapreduce# export HADOOP_YARN_HOME=/usr/lib/hadoop-yarn

root@hyd-hadoop-test:/usr/lib/hadoop-mapreduce# export HADOOP_CONF_DIR=/usr/lib/hadoop/etc/hadoop

root@hyd-hadoop-test:~# mkdir -p /data/yarn/local
--date/time--> 08/14/18 15:02:35
root@hyd-hadoop-test:~# mkdir -p /data/yarn/logs

root@hyd-hadoop-test:~# chown -R yarn:yarn /data/yarn/local
--date/time--> 08/14/18 15:03:11
root@hyd-hadoop-test:~# chown -R yarn:yarn /data/yarn/logs/

root@hyd-hadoop-test:~# sudo service hadoop-yarn-resourcemanager start
starting resourcemanager, logging to /var/log/hadoop-yarn/yarn-yarn-resourcemanager-hyd-hadoop-test.arkit.co.in.out
Started Hadoop resourcemanager: [ OK ]
--date/time--> 08/14/18 15:11:56
root@hyd-hadoop-test:~# sudo service hadoop-yarn-nodemanager start
starting nodemanager, logging to /var/log/hadoop-yarn/yarn-yarn-nodemanager-hyd-hadoop-test.arkit.co.in.out
Started Hadoop nodemanager: [ OK ]
--date/time--> 08/14/18 15:12:10
root@hyd-hadoop-test:~# sudo service hadoop-mapreduce-historyserver start
starting historyserver, logging to /var/log/hadoop-mapreduce/mapred-mapred-historyserver-hyd-hadoop-test.arkit.co.in.out
18/08/14 15:12:17 INFO hs.JobHistoryServer: STARTUP_MSG:
/************************************************************
STARTUP_MSG: Starting JobHistoryServer
STARTUP_MSG: user = mapred
STARTUP_MSG: host = hyd-hadoop-test/192.168.2.5
STARTUP_MSG: args = []
STARTUP_MSG: version = 2.6.0-cdh5.15.0
STARTUP_MSG: build = http://github.com/cloudera/hadoop -r e3cb23a1cb2b89d074171b44e71f207c3d6ffa50; compiled by 'jenkins' on 2018-05-24T11:19Z
STARTUP_MSG: java = 1.8.0_181
Started Hadoop historyserver: [ OK ]

root@hyd-hadoop-test:~# sudo -u hdfs hadoop fs -mkdir -p /user/hdfs

root@hyd-hadoop-test:~# sudo -u hdfs hadoop fs -mkdir -p /user/yarn

root@hyd-hadoop-test:~# sudo -u hdfs hadoop fs -chown yarn:yarn /user/yarn

## Install all CDH components
yum install hive hive-metastore hive-server2 hive-hbase whirr sqoop2-server sqoop2-client sqoop spark-core spark-master spark-worker spark-history-server spark-python hbase-solr-indexer hbase-solr-doc solr-mapreduce solr-crunch solr-server pig mahout hadoop-kms hadoop-kms-server impala* hue hadoop-httpfs oozie

Hue Configuration
# vi /etc/hadoop/conf.my_cluster/core-site.xml

<property>
<name>hadoop.proxyuser.hue.hosts</name>
<value>*</value>
</property>
<property>
<name>hadoop.proxyuser.hue.groups</name>
<value>*</value>
</property>

oozie configuraion
# vi /etc/oozie/conf/oozie-site.xml
<property>
<name>oozie.service.ProxyUserService.proxyuser.hue.hosts</name>
<value>*</value>
</property>

<property>
<name>oozie.service.ProxyUserService.proxyuser.hue.groups</name>
<value>*</value>
</property>

Add Server IP Address abd Port Number
# vi /etc/hue/conf.empty/hue.ini

## Webserver listens on this address and port
http_host=192.168.2.5
http_port=8888

Beeswax configuration
[beeswax]

# Host where HiveServer2 is running.
# If Kerberos security is enabled, use fully-qualified domain name (FQDN).
hive_server_host=192.168.2.5

# Port where HiveServer2 Thrift server runs on.
hive_server_port=10000

# Hive configuration directory, where hive-site.xml is located
hive_conf_dir=/usr/lib/hive/conf/

# Timeout in seconds for thrift calls to Hive service
## server_conn_timeout=120

[impala]
# Host of the Impala Server (one of the Impalad)
server_host=192.168.2.5

# Port of the Impala Server
server_port=21050

# Kerberos principal
## impala_principal=impala/hostname.foo.com

# Turn on/off impersonation mechanism when talking to Impala
impersonation_enabled=False

### Database configuration
[[[mysql]]]
name=mysqldb
engine=mysql
host=localhost
port=3306
user=root
password=temp
options={}

To Connect Database Installing Mariadb in Centos 7
yum install mariadb* -y

# mysql -u root -ptemp

MariaDB [(none)]> create database mysqldb;
Query OK, 1 row affected (0.00 sec)

MariaDB [(none)]> show databases;
+--------------------+
| Database |
+--------------------+
| information_schema |
| mysql |
| mysqldb |
| performance_schema |
+--------------------+
4 rows in set (0.00 sec)

MariaDB [(none)]> exit
Bye

root@hyd-hadoop-test:/usr/lib/hue/pids# /etc/init.d/hive-server2 status
Hive Server2 is not running [FAILED]
--date/time--> 08/14/18 18:04:16
root@hyd-hadoop-test:/usr/lib/hue/pids# /etc/init.d/hive-server2 restart
Stopped Hive Server2: [ OK ]
Started Hive Server2 (hive-server2): [ OK ]
--date/time--> 08/14/18 18:04:25
root@hyd-hadoop-test:/usr/lib/hue/pids# systemctl enable hive-server2
hive-server2.service is not a native service, redirecting to /sbin/chkconfig.
Executing /sbin/chkconfig hive-server2 on

Linux Tutorial Step by Step Guide

Downloads

Linux Tutorial For Beginners - RHCSA Certification Complete Tutorial

CDH Manual installation in single Node

Youtube Videos

Tutorial