Common Linux Commands

  • View processes: top
top - 18:18:44 up 5 days, 17:15,  1 user,  load average: 0.15, 0.46, 8.05
Tasks: 222 total,   1 running, 221 sleeping,   0 stopped,   0 zombie
%Cpu(s):  0.2 us,  0.3 sy,  0.0 ni, 99.6 id,  0.0 wa,  0.0 hi,  0.0 si,  0.0 st
KiB Mem :  1511808 total,    71260 free,  1238852 used,   201696 buff/cache
KiB Swap:  2097148 total,   923132 free,  1174016 used.   103924 avail Mem 
                         (VIRT: virtual memory   RES: resident memory   SHR: shared memory   %MEM: share of physical memory   COMMAND: command name/line)
   PID USER      PR  NI    VIRT    RES    SHR S  %CPU %MEM     TIME+ COMMAND                                                                      
    47 root      39  19       0      0      0 S   5.0  0.0   1:39.64 khugepaged                                                                   
 17942 root      20   0 2940336  48644   1952 S   4.0  3.2 317:20.25 java                                                                         
  5268 root      20   0 5418828 309756   5416 S   2.3 20.5 393:51.19 java                                                                         
   741 root      20   0   90568     60     56 S   1.0  0.0  10:17.27 rngd                                                                         
 18052 root      20   0 2828980  29392   1956 S   1.0  1.9 129:52.15 java   
  • View the number of cores per physical CPU (i.e. the core count): cat /proc/cpuinfo | grep "cpu cores" | uniq
cpu cores       : 4
  • View CPU info (model): cat /proc/cpuinfo | grep name | cut -f2 -d: | uniq -c
 4  Intel(R) Core(TM) i5-4210H CPU @ 2.90GHz
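A couple of related checks, as a small hedged sketch (nproc and the processor count are not in the notes above, just common companions):
# Count logical processors; the two commands should agree
nproc
grep -c '^processor' /proc/cpuinfo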
  • Check which process occupies a port: lsof -i:<port>
COMMAND  PID  USER   FD   TYPE DEVICE SIZE/OFF NODE NAME
mysqld  1561 mysql    4u  IPv6 154188      0t0  TCP wq2:mysql->wq3:60282 (ESTABLISHED)
mysqld  1561 mysql   21u  IPv6  29430      0t0  TCP *:mysql (LISTEN)
  • netstat -tunlp | grep <port>
tcp6       0      0 :::3306                 :::*                    LISTEN      1561/mysqld
  • Kill a process: kill -9 <pid>
kill -9 1561
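A hedged one-liner combining the port lookup and the kill (port 3306 is just the example from above; lsof -t prints only the PIDs, so review the list before using -9):
# Kill whatever is listening on port 3306 in one step
kill -9 $(lsof -t -i:3306)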
  • View memory usage: free -lh
              total        used        free      shared  buff/cache   available
Mem:           1.4G        1.1G        107M         18M        281M        210M
Low:           1.4G        1.3G        107M
High:            0B          0B          0B
Swap:          2.0G        3.0M        2.0G
  • View disk usage: df -lh
文件系统                 容量  已用  可用 已用% 挂载点
devtmpfs                 722M     0  722M    0% /dev
tmpfs                    739M     0  739M    0% /dev/shm
tmpfs                    739M   19M  720M    3% /run
tmpfs                    739M     0  739M    0% /sys/fs/cgroup
/dev/mapper/centos-root   50G   18G   33G   35% /
/dev/sda1               1014M  237M  778M   24% /boot
tmpfs                    148M   12K  148M    1% /run/user/42
tmpfs                    148M     0  148M    0% /run/user/0
  • View file/directory size: du -h <file or directory> (a sorted variant is sketched after the output below)

du -h -d1 shoppingmallonline
-d, --max-depth=N

[root@wq1 ~]# du -h --max-depth=1 shoppingmallonline
4.0K    shoppingmallonline/categories
146M    shoppingmallonline/custom
1.4G    shoppingmallonline/employee
1.2G    shoppingmallonline/order
398M    shoppingmallonline/orderdetail
416K    shoppingmallonline/products
3.1G    shoppingmallonline
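A sorted variant, as a sketch (sort -h is a GNU coreutils option, not something used in the notes above): list the per-directory sizes with the biggest consumer last.
# Human-readable sizes, smallest to largest
du -sh shoppingmallonline/* | sort -h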
  • View command history
[root@wq1 ~]# history 
   22  ll
   23  cd flume-1.9.0/
  • Copy data between cluster nodes
scp -r /opt/flume-1.9.0 root@wq2:/opt/
  • Fuzzy-search for a file: find / -name "*hadoop*" (a narrower variant follows the output below)
    find <directory to search> -name <file or directory name pattern>
/root/hadoop-2.7.7.tar.gz
/root/hadoop_tmp
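A narrower variant, as a sketch (the /opt path is only an example): limit the search scope and hide permission-denied noise.
# Search only under /opt and discard "Permission denied" messages
find /opt -name "*hadoop*" 2>/dev/null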
  • Search for text inside a file
#Show the matching lines in the file, with line numbers
grep -n <text to find> <file>
#Show the lines that do not contain the text
grep -v <text to exclude> <file>
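A short sketch of both forms, reusing the w.text file that appears later in these notes:
# Lines containing "10", with line numbers
grep -n 10 w.text
# Lines that do not contain "10"
grep -v 10 w.text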

vim-related commands

  • Quickly search for a string in vim
    Shift + *
 The fastest way is to place the cursor on any character of the word you want to find and press Shift + *;
 the word is matched immediately, and n / N jump to the next / previous occurrence.
  • Show line numbers temporarily
:set nu
  • Jump to a specific line
    n is the line number to jump to
:n

  • View a range of lines
    sed -n 'a,bp' <file> prints lines a through b (b must be >= a); if b < a, only line a is printed.
[root@wq2 ~]# sed -n '10,1p' w.text
10
[root@wq2 ~]# sed -n '10,13p' w.text 
10
11
12
13
  • Count lines in a file
[root@wq2 ~]# wc --help
用法:wc [选项]... [文件]...
 或:wc [选项]... --files0-from=F
  -c, --bytes            print the byte counts
  -m, --chars            print the character counts
  -l, --lines            print the newline counts
      --files0-from=文件        从指定文件读取以NUL 终止的名称,如果该文件被
                                        指定为"-"则从标准输入读文件名
  -L, --max-line-length 显示最长行的长度
  -w, --words                   显示单词计数
      --help            显示此帮助信息并退出
      --version         显示版本信息并退出
#Show the line and word counts
[root@wq2 ~]# wc -lw w.text 
14 14 w.text
  • Count the files or directories in a folder (optionally including subfolders); an alternative using find is sketched after the example below
#Count the files in the current folder
ls -l |grep "^-"|wc -l
#Count the files in the current folder, including those in subfolders
ls -lR|grep "^-"|wc -l
#Count the directories, including those in subfolders
ls -lR|grep "^d"|wc -l
[root@wq1 opt]# ls -l|grep "^d"|wc -l 
9
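An alternative sketch that uses find instead of parsing ls output (not from the notes above; find tends to be safer in scripts):
# Files directly in the current folder
find . -maxdepth 1 -type f | wc -l
# Files including those in subfolders
find . -type f | wc -l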
  • Append content to a file: echo 'specialwu' >> ../test.txt
  • Follow a file: tail -f test.txt prints the last 10 lines and then keeps following, so anything appended to the file shows up as it arrives
  • Move a file: mv test.txt /root/newtest.txt
  • Extract an archive: tar -zxvf test.tar.gz
  • Create an archive: tar -zcvf test.tar.gz test.txt (see the sketch below)
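A slightly fuller sketch of the two tar forms above (the archive and directory names are made up for illustration):
# Pack a whole directory into a gzip-compressed archive
tar -zcvf flume-1.9.0.tar.gz flume-1.9.0/
# Extract it into /opt instead of the current directory
tar -zxvf flume-1.9.0.tar.gz -C /opt/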

The commands above are ad-hoc notes; Linux commands can be grouped into the following ten categories.

  1. File management

– View the current location

[root@wq1 ~]# pwd
/root
  2. Document editing
[root@wq1 ~]# vim slaves 
wq1
~
ESC then :wq saves and exits
  3. File transfer
[root@wq1 ~]# scp w.text /root/tmp/
[root@wq1 ~]# cd tmp/
[root@wq1 tmp]# ll
总用量 4
-rw-r--r-- 1 root root 54 11月 18 17:24 w.text
[root@wq1 tmp]# 
  4. Disk management
  5. Disk maintenance
  6. Network communication
netstat -a  //show detailed network status
netstat -nu //show UDP port usage
netstat -i      //show the network interface list
netstat -g  //show multicast group membership
netstat -s  //show network statistics
netstat -l      //show listening sockets
netstat -ntlp   //show all current TCP ports
netstat -ntulp |grep 80   //show usage of port 80
netstat -an | grep 3306   //show usage of port 3306
netstat -ntp | grep :3306 | wc  //count connections on a given port, e.g. 3306
netstat -anp |grep 3306 //show the client IPs connected to a given port, e.g. 3306
  7. System management

– View system information

cat /proc/cpuinfo
processor       : 0
vendor_id       : GenuineIntel
cpu family      : 6
model           : 60
model name      : Intel(R) Core(TM) i5-4210H CPU @ 2.90GHz
stepping        : 3
microcode       : 0x1c
cpu MHz         : 2893.369
cache size      : 3072 KB
physical id     : 0
siblings        : 4
core id         : 0
cpu cores       : 4
apicid          : 0
initial apicid  : 0
fpu             : yes
fpu_exception   : yes
cpuid level     : 13
wp              : yes
  8. System settings
  9. Backup and compression
  10. Device management

– Physical devices present

[root@wq1 ~]# fdisk -l 

磁盘 /dev/sda:107.4 GB, 107374182400 字节,209715200 个扇区
Units = 扇区 of 1 * 512 = 512 bytes
扇区大小(逻辑/物理):512 字节 / 512 字节
I/O 大小(最小/最佳):512 字节 / 512 字节
磁盘标签类型:dos
磁盘标识符:0x000e71a9

   设备 Boot      Start         End      Blocks   Id  System
/dev/sda1   *        2048     2099199     1048576   83  Linux
/dev/sda2         2099200   111165439    54533120   8e  Linux LVM
磁盘 /dev/mapper/centos-root:53.7 GB, 53687091200 字节,104857600 个扇区
Units = 扇区 of 1 * 512 = 512 bytes
扇区大小(逻辑/物理):512 字节 / 512 字节
I/O 大小(最小/最佳):512 字节 / 512 字节
磁盘 /dev/mapper/centos-swap:2147 MB, 2147483648 字节,4194304 个扇区
Units = 扇区 of 1 * 512 = 512 bytes
扇区大小(逻辑/物理):512 字节 / 512 字节
I/O 大小(最小/最佳):512 字节 / 512 字节

  • Shell script
    Usage: copyall.sh /home/mongodb /home/mongodb
#Adjust the host names to your own machines before use
#!/bin/bash
HOSTS='wq1 wq2 wq3'
for HOST in ${HOSTS}
do
  echo "++Copying $1 to ${HOST}:$2 ..."
  scp -rq ${1} ${HOST}:${2}
  echo "--Transfer finished!"
done

Common Commands for Big Data Components

Component / operation → command

hdfs
  start:                     start-dfs.sh
  upload a local file:       hdfs dfs -put upload1.txt
  format:                    hdfs namenode -format
  download a file to local:  hdfs dfs -get /upload1.txt /dowtests
flume
  start:                     bin/flume-ng agent --conf /opt/flume-1.9.0/conf --conf-file /opt/flume-1.9.0/conf/my_conf/hdfs-hive.conf --name a3 -Dflume.root.logger=INFO,console
zookeeper
  start:                     bin/zkServer.sh start
kafka
  start in the background:   kafka-server-start.sh -daemon /opt/kafka/config/server.properties
  list topics:               kafka-topics.sh --zookeeper wq1:2181 --list
  create a topic:            kafka-topics.sh --zookeeper bd0201:2181 --create --topic topic1 --partitions 3 --replication-factor 2
  test a consumer:           kafka-console-consumer.sh --bootstrap-server bd0201:9092 --topic topic1
  test a producer:           kafka-console-producer.sh --broker-list bd0201:9092 --topic topic1
hive
  start:                     bin/hive
  create a table:            create table testhive1(year String,id int,yearone int,c1 int,c2 int,c3 int,c4 int,c5 int,c6 int,c7 int) row format delimited fields terminated by ',';
  load an HDFS file into a table: load data inpath 'hdfs://wq1:9000/hdfs-hive/flume-hdfs.1603706948079' into table testhive1;

HDFS Practice

File /profile/1.txt could only be replicated to 0 nodes instead of minReplication (=1).

Cause: the Hadoop cluster's safe mode has not been turned off.

org.xml.sax.SAXParseException; systemId: file:/opt/hadoop-2.7.7/etc/hadoop/core-site.xml;

Cause: there are stray spaces or blank lines in the file.

<?xml version="1.0" encoding="UTF-8"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<!-- Put site-specific property overrides in this file. -->

<configuration>
<property>
    <!-- Default file system (the host and port where the NameNode service runs) -->
    <name>fs.defaultFS</name>
    <value>hdfs://wq1:9000/</value>
</property>
<property>
    <!-- Directory where HDFS keeps its data; here it is placed under the Hadoop install directory -->
    <name>hadoop.tmp.dir</name>
    <value>/opt/hadoop-2.7.7/data</value>
</property>
</configuration>

No DataNode after starting HDFS

cat /opt/hadoop-2.7.7/data/dfs/data/current/VERSION
cat /opt/hadoop-2.7.7/data/dfs/name/current/VERSION
Make the clusterID in data/current/VERSION match the clusterID in name/current/VERSION; alternatively, delete everything under dfs and format again. A minimal sketch of the first option follows.
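A minimal sketch, assuming HDFS is stopped and the paths are exactly the ones shown above:
# Copy the NameNode's clusterID into the DataNode's VERSION file, then restart HDFS
NN_CID=$(grep '^clusterID=' /opt/hadoop-2.7.7/data/dfs/name/current/VERSION | cut -d= -f2)
sed -i "s/^clusterID=.*/clusterID=${NN_CID}/" /opt/hadoop-2.7.7/data/dfs/data/current/VERSION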

Cannot create file/upload1.txt.COPYING. Name node is in safe mode.

Run hadoop dfsadmin -safemode leave to leave safe mode.

Upload a file from the server: hdfs dfs -put upload1.txt /
Upload a file through Java code:

public static void main(String[] args) throws Exception{
        boolean flag;
        // Working with HDFS

        // The entry point of the HDFS API is FileSystem
        // FileSystem has several implementations
        // FileSystem can target different file systems depending on the URI

        // 0. Create the configuration object

        // This object lets the program override some Hadoop settings at run time
        // Precedence: code > configuration files on the classpath > cluster defaults
        Configuration conf = new Configuration();
        conf.set("dfs.replication", "1");

        // 1. Create the HDFS file system object through FileSystem

        FileSystem fs = FileSystem.get(
                URI.create("hdfs://192.168.236.236:9000/"),
                conf,
                "root"
        );

        // 2. Upload a file (see upload() below)
        // upload(fs);

        // 3. Download a file
        download(fs);

        // 4. Create a directory
//         flag = fs.mkdirs(new Path("/2020-8-31"));
//        System.out.println(flag);

        // 5. Delete a file or directory
//         flag = fs.delete(new Path("/2020-8-31"),true);
//        System.out.println(flag);

        // 6. Rename a file
        /*flag = fs.rename(new Path("/1.txt"), new Path("/2.txt"));
        System.out.println(flag);*/

        // 7. List file details
        /*FileStatus[] fileStatuses = fs.listStatus(new Path("/"));
        for (FileStatus fileStatus : fileStatuses) {
            //TODO  display the files the way the Hadoop UI does

            // 8. Type check
            boolean isFile = fileStatus.isFile();
            boolean isDirectory = fileStatus.isDirectory();
        }*/
    }
 private static void upload(FileSystem fs) throws IOException {
        // 2. Upload a file (overwrites the target)
        fs.copyFromLocalFile(
                new Path("C:\\Users\\Administrator\\Desktop\\up1"),
                new Path("/")
        );
    }

Download a file to local: hdfs dfs -get /upload1.txt /dowtests

    private static void download(FileSystem fs) throws IOException {
        fs.copyToLocalFile(
                new Path("/up1/1.txt"),
                new Path("C:\\Users\\Administrator\\Desktop\\up1")
        );
    }

Flume Practice

See the official Flume configuration documentation.
0. Run command

# Run from the flume install directory.
# --conf       the conf directory of the flume install, so flume can find its environment settings and log4j configuration
# --conf-file  the agent configuration file we wrote ourselves
# --name       the name of the agent
# -D...        log level and output target
bin/flume-ng agent \
--conf /opt/flume-1.9.0/conf \
--conf-file /opt/flume-1.9.0/conf/my_conf/netcat_logger.conf \
--name a1 \
-Dflume.root.logger=INFO,console
  1. Read data from a single file in real time and store it on HDFS
    > Custom conf file
# Name the components on this agent
# a1 is the name we chose for this agent
# a1.sources lists the names of the sources this agent contains
# if a component type has several instances, separate their names with spaces
a1.sources = r1
a1.sinks = k1
a1.channels = c1
# Describe/configure the source
a1.sources.r1.type = exec
a1.sources.r1.command = tail -F /root/web.log
# Describe the sink
a1.sinks.k1.type = hdfs
a1.sinks.k1.hdfs.path = hdfs://bd0201:9000/flume_test
# prefix of the uploaded files
a1.sinks.k1.hdfs.filePrefix = web-log
# how often to roll a new file; 0 disables time-based rolling
a1.sinks.k1.hdfs.rollInterval = 0
# roll a new file roughly every 127 MB
a1.sinks.k1.hdfs.rollSize = 133169152
# 0 = rolling does not depend on the number of events
a1.sinks.k1.hdfs.rollCount = 0
# format for sequence files: Text or Writable (default)
a1.sinks.k1.hdfs.writeFormat = text
# SequenceFile, DataStream or CompressedStream; DataStream means the file is not compressed
a1.sinks.k1.hdfs.fileType = DataStream
# Use a channel which buffers events in memory
a1.channels.c1.type = memory
a1.channels.c1.capacity = 1000
a1.channels.c1.transactionCapacity = 100
# Bind the source and sink to the channel
a1.sources.r1.channels = c1
a1.sinks.k1.channel = c1

flume-kafka

  • Send a single file to Kafka
# Configure the source
a1.sources.r1.type = exec
a1.sources.r1.command = tail -F /root/cdhdata/categories/dt=26/categories-.1603704061065
  • Send multiple files to Kafka
# Configure the source
a1.sources.r1.type = TAILDIR
a1.sources.r1.positionFile = /var/log/flume/taildir_position.json
a1.sources.r1.filegroups = f1
a1.sources.r1.filegroups.f1 = /root/cdhdata/custom/.*txt
a1.sources.r1.headers.f1.headerKey1 = value1
a1.sources.r1.fileHeader = true
a1.sources.r1.maxBatchCount = 1000

With the configuration written as above, the following error appeared:

2021-03-28 19:24:18,770 (lifecycleSupervisor-1-0) [ERROR - org.apache.flume.source.taildir.ReliableTaildirEventReader.loadPositionFile(ReliableTaildirEventReader.java:147)] Failed loading positionFile: /root/opt/flume-1.9.0/data/taildir_position.json
java.io.EOFException: End of input at line 1 column 1
        at com.google.gson.stream.JsonReader.nextNonWhitespace(JsonReader.java:954)
        at com.google.gson.stream.JsonReader.nextValue(JsonReader.java:771)
        at com.google.gson.stream.JsonReader.peek(JsonReader.java:367)
        at com.google.gson.stream.JsonReader.expect(JsonReader.java:337)
        at com.google.gson.stream.JsonReader.beginArray(JsonReader.java:306)
        at org.apache.flume.source.taildir.ReliableTaildirEventReader.loadPositionFile(ReliableTaildirEventReader.java:111)
        at org.apache.flume.source.taildir.ReliableTaildirEventReader.<init>(ReliableTaildirEventReader.java:96)
        at org.apache.flume.source.taildir.ReliableTaildirEventReader.<init>(ReliableTaildirEventReader.java:49)
        at org.apache.flume.source.taildir.ReliableTaildirEventReader$Builder.build(ReliableTaildirEventReader.java:355)
        at org.apache.flume.source.taildir.TaildirSource.start(TaildirSource.java:105)
        at org.apache.flume.source.PollableSourceRunner.start(PollableSourceRunner.java:71)
        at org.apache.flume.lifecycle.LifecycleSupervisor$MonitorRunnable.run(LifecycleSupervisor.java:249)
        at java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:511)
        at java.util.concurrent.FutureTask.runAndReset(FutureTask.java:308)
        at java.util.concurrent.ScheduledThreadPoolExecutor$ScheduledFutureTask.access$301(ScheduledThreadPoolExecutor.java:180)
        at java.util.concurrent.ScheduledThreadPoolExecutor$ScheduledFutureTask.run(ScheduledThreadPoolExecutor.java:294)
        at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
        at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
        at java.lang.Thread.run(Thread.java:748)

With the same configuration, however, the error did not appear on a second run when the volume of data being transferred was smaller.
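One common workaround, offered only as an assumption rather than something confirmed by these notes: the EOFException suggests the position file exists but is empty, so removing it before restarting the agent lets Flume rebuild it.
# Remove the empty/corrupt position file; Flume recreates it on the next start
rm -f /root/opt/flume-1.9.0/data/taildir_position.json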

kafka-flume-hdfs

Start command

flume-ng agent \
--conf /opt/flume-1.9.0/conf \
--conf-file /opt/flume-1.9.0/conf/my_conf/hdfs-hive.conf \
--name a3 \
-Dflume.root.logger=INFO,console

Custom configuration file hdfs-hive.conf

#new
# example.conf: A single-node Flume configuration
# Name the components on this agent
a3.sources = r1
a3.sinks = k1
a3.channels = c1
# Describe/configure the source
a3.sources.r1.type = org.apache.flume.source.kafka.KafkaSource
# accumulate up to 5000 events per batch before handing them to the channel
a3.sources.r1.batchSize = 5000
a3.sources.r1.batchDurationMillis = 2000
a3.sources.r1.kafka.bootstrap.servers = wq1:9092,wq2:9092,wq3:9092
a3.sources.r1.kafka.consumer.auto.offset.reset = earliest
a3.sources.r1.kafka.topics = hdfs-hive
# Describe the sink
a3.sinks.k1.type = hdfs
a3.sinks.k1.hdfs.path = /flume-kafka-hdfs/
a3.sinks.k1.hdfs.filePrefix = flume-hdfs
a3.sinks.k1.hdfs.rollInterval = 0
a3.sinks.k1.hdfs.rollSize = 267386880
a3.sinks.k1.hdfs.rollCount = 0
a3.sinks.k1.hdfs.fileType = DataStream
a3.sinks.k1.hdfs.writeFormat = Text
# Use a channel which buffers events in memory
a3.channels.c1.type = file
a3.channels.c1.capacity = 10000
a3.channels.c1.byteCapacityBufferPercentage = 20
a3.channels.c1.transactionCapacity = 10000
a3.channels.c1.byteCapacity = 20000000
# Bind the source and sink to the channel
a3.sources.r1.channels = c1
a3.sinks.k1.channel = c1
  • Exception record (although this occurred, the data still made it to the cluster)
org.apache.flume.ChannelFullException: The channel has reached it's capacity. This might be the result of a sink on the channel having too low of batch size, a downstream system running slower than normal, or that the channel capacity is just too low. [channel=c1]
        at org.apache.flume.channel.file.FileChannel$FileBackedTransaction.doPut(FileChannel.java:505)
        at org.apache.flume.channel.BasicTransactionSemantics.put(BasicTransactionSemantics.java:93)
        at org.apache.flume.channel.BasicChannelSemantics.put(BasicChannelSemantics.java:80)
        at org.apache.flume.channel.ChannelProcessor.processEventBatch(ChannelProcessor.java:191)
        at org.apache.flume.source.kafka.KafkaSource.doProcess(KafkaSource.java:311)
        at org.apache.flume.source.AbstractPollableSource.process(AbstractPollableSource.java:60)
        at org.apache.flume.source.PollableSourceRunner$PollingRunner.run(PollableSourceRunner.java:133)
        at java.lang.Thread.run(Thread.java:748)
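If the channel keeps filling up, one hedged tuning sketch (the values are illustrative, not from these notes) is to raise the channel capacity and let the HDFS sink drain larger batches:
# Possible additions to hdfs-hive.conf -- tune to your own throughput
a3.channels.c1.capacity = 100000
a3.channels.c1.transactionCapacity = 10000
a3.sinks.k1.hdfs.batchSize = 1000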

Operations in Hive

Create a table

create table testhive1(year String,id int,yearone int,c1 int,c2 int,c3 int,c4 int,c5 int,c6 int,c7 int) row format delimited fields terminated by ',';

Load the file from HDFS into the table

load data inpath 'hdfs://wq1:9000/hdfs-hive/flume-hdfs.1603706948079' into table testhive1;

Import Data from HDFS into MySQL with Sqoop

sqoop-env.sh

export HADOOP_COMMON_HOME=/opt/cloudera/parcels/CDH-6.3.2-1.cdh6.3.2.p0.1605554/lib/hadoop
export HADOOP_MAPRED_HOME=/opt/cloudera/parcels/CDH-6.3.2-1.cdh6.3.2.p0.1605554/lib/hadoop
export HIVE_HOME=/opt/cloudera/parcels/CDH-6.3.2-1.cdh6.3.2.p0.1605554/lib/hive
export ZOOCFGDIR=/opt/cloudera/parcels/CDH-6.3.2-1.cdh6.3.2.p0.1605554/lib/zookeeper
ERROR tool.ExportTool: Encountered IOException running export job:
Export job failed!
        at org.apache.sqoop.mapreduce.ExportJobBase.runExport(ExportJobBase.java:445)
        at org.apache.sqoop.manager.SqlManager.exportTable(SqlManager.java:931)
        at org.apache.sqoop.tool.ExportTool.exportTable(ExportTool.java:80)
        at org.apache.sqoop.tool.ExportTool.run(ExportTool.java:99)
        at org.apache.sqoop.Sqoop.run(Sqoop.java:147)
        at org.apache.hadoop.util.ToolRunner.run(ToolRunner.java:76)
        at org.apache.sqoop.Sqoop.runSqoop(Sqoop.java:183)
        at org.apache.sqoop.Sqoop.runTool(Sqoop.java:234)
        at org.apache.sqoop.Sqoop.runTool(Sqoop.java:243)
        at org.apache.sqoop.Sqoop.main(Sqoop.java:252)

/opt/cloudera/parcels/CDH-6.3.2-1.cdh6.3.2.p0.1605554/lib/sqoop/
Run the export from the bin directory of the sqoop install:

sqoop export \
--connect jdbc:mysql://192.168.3.51:3306/hive \
--username root \
--password 123456 \
--table zuihou \
--export-dir /user/hive/warehouse/ods_database.db/categories_ods_t/yt=2020/mt=10/dt=26/categories-.1603704061065 \
--fields-terminated-by '\t' \
--m 1 

Install and Configure HUE

Download: http://archive.cloudera.com/cdh5/cdh/5/
It is recommended to download version 3.9.0 (about 169 MB) offline first and then upload it to the cluster.
See the official configuration documentation.

Required dependencies

$ yum install -y gcc libxml2-devel libxslt-devel cyrus-sasl-devel mysql-devel python-devel python-setuptools python-simplejson sqlite-devel ant gmp-devel cyrus-sasl-plain cyrus-sasl-devel cyrus-sasl-gssapi libffi-devel openldap-devel

Run sudo chown -R hue:hue /usr/share/hue, otherwise the page cannot be reached.

Configuration files on the Hadoop side
Add the corresponding settings to these three configuration files under /opt/hadoop-2.7.7/etc/hadoop:

hdfs-site.xml

<property>
  <name>dfs.webhdfs.enabled</name>
  <value>true</value>
</property>

core-site.xml

<property>
 <name>hadoop.proxyuser.hue.hosts</name>
  <value>*</value>
</property>
<property>
  <name>hadoop.proxyuser.hue.groups</name>
  <value>*</value>
</property>

httpfs-site.xml

<property>
  <name>httpfs.proxyuser.hue.hosts</name>
  <value>*</value>
</property>
<property>
  <name>httpfs.proxyuser.hue.groups</name>
  <value>*</value>
</property>

Couldn't write lextab module <module 'slimit.lextab' from '/usr/local/lib/python2.7/dist-packages/slimit/lextab.pyc'>

Fix: in /usr/share/hue/build/env/bin run pip uninstall ply
then pip install ply==3.4

Modify the HUE configuration file

At the moment the page can browse HDFS and MySQL, but running HiveQL statements reports "database is locked".
vim /usr/share/hue/desktop/conf/hue.ini

HDFS settings (around line 912 of hue.ini)
 [[hdfs_clusters]]
    # HA support by using HttpFs

    [[[default]]]
      # Enter the filesystem uri
      fs_defaultfs=hdfs://192.168.236.236:8020

      # NameNode logical name.
      ## logical_name=192.168.236.236

      # Use WebHdfs/HttpFs as the communication mechanism.
      # Domain should be the NameNode or HttpFs host.
      # Default port is 14000 for HttpFs.
      webhdfs_url=http://192.168.236.236:50070/webhdfs/v1

      # Change this if your HDFS cluster is Kerberos-secured
      ## security_enabled=false

      # In secure mode (HTTPS), if SSL certificates from YARN Rest APIs
      # have to be verified against certificate authority
      ## ssl_cert_ca_verify=True

      # Directory of the Hadoop configuration
      hadoop_conf_dir=/opt/hadoop-2.7.7/etc/hadoop
     # the next two lines are custom additions
     hadoop_hdfs_home=/opt/hadoop-2.7.7
     hadoop_bin=/opt/hadoop-2.7.7/bin


     Hive settings (around line 1026)
     [beeswax]
  # Host where HiveServer2 is running.
  # If Kerberos security is enabled, use fully-qualified domain name (FQDN).
  hive_server_host=192.168.236.238

  # Port where HiveServer2 Thrift server runs on.
  hive_server_port=10000

  # Hive configuration directory, where hive-site.xml is located
  hive_conf_dir=/etc/hive-1.2.2/conf

  # Timeout in seconds for thrift calls to Hive service
  server_conn_timeout=120

  # Choose whether to use the old GetLog() thrift call from before Hive 0.14 to retrieve the logs.
  # If false, use the FetchResults() thrift call from Hive 1.0 or more instead.
  ## use_get_log_api=false

  # Limit the number of partitions that can be listed.
  ## list_partitions_limit=10000

  # The maximum number of partitions that will be included in the SELECT * LIMIT sample query for partitioned tables.
  ## query_partitions_limit=10

  # A limit to the number of rows that can be downloaded from a query before it is truncated.
  # A value of -1 means there will be no limit.
  ## download_row_limit=100000
     [metastore]
  # Flag to turn on the new version of the create table wizard.
  enable_new_create_table=true

  # Flag to force all metadata calls (e.g. list tables, table or column details...) to happen via HiveServer2 if available instead of Impala.
  ## force_hs2_metadata=false


MySQL settings (around line 1604)
    [[[mysql]]]
      # Name to show in the UI.
      nice_name="wq2mysql"

      # For MySQL and PostgreSQL, name is the name of the database.
      # For Oracle, Name is instance of the Oracle server. For express edition
      # this is 'xe' by default.
      name=hmysql

      # Database backend to use. This can be:
      # 1. mysql
      # 2. postgresql
      # 3. oracle
      engine=mysql

      # IP or hostname of the database to connect to.
      host=192.168.236.237

      # Port the database server is listening to. Defaults are:
      # 1. MySQL: 3306
      # 2. PostgreSQL: 5432
      # 3. Oracle Express Edition: 1521
      port=3306

      # Username to authenticate with when connecting to the database.
      user=specialwu

When starting hive: Resources are low on NN. Please add or free up more resources then turn off safe mode manually.

Cause: not enough storage space.

you are a Hue admin but not a HDFS superuser, "hdfs" or part of HDFS supergroup, "supergroup"

Cause: the three configuration files above were not modified, or HDFS was not restarted after modifying them.

Hive fails to connect on port 10000

In /opt/hive-1.2.2/conf/hive-env.sh set export HIVE_SERVER2_THRIFT_PORT=10000
Then, from the hive install directory, run the following to start the hive metastore and hiveserver2 services:

nohup bin/hive --service metastore &
nohup bin/hive --service hiveserver2 &
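A quick sanity check before pointing Hue at it (just an assumption about the workflow, reusing the netstat form from earlier in these notes):
# HiveServer2 should now be listening on port 10000
netstat -ntlp | grep 10000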
Start HUE: /usr/share/hue/build/env/bin/supervisor

jps on each node (jps: Java Virtual Machine Process Status Tool)

wq1
3139 NameNode
4547 DataNode
5268 Kafka
18052 NodeManager
17942 ResourceManager
19814 Jps
2697 SecondaryNameNode
4940 QuorumPeerMain

wq2
3972 DataNode
19654 Jps
4344 QuorumPeerMain
2537 SecondaryNameNode
16570 NodeManager

wq3
42499 DataNode
44661 QuorumPeerMain
115591 NodeManager
114505 RunJar
28556 SecondaryNameNode
114591 RunJar
117470 Jps