Skip to content

Instantly share code, notes, and snippets.

@kangvcar
Created June 4, 2025 14:27
Show Gist options
  • Select an option

  • Save kangvcar/7991e4cd634e5095c7dda5d452d7d9c9 to your computer and use it in GitHub Desktop.

Select an option

Save kangvcar/7991e4cd634e5095c7dda5d452d7d9c9 to your computer and use it in GitHub Desktop.

Hive基础操作

准备工作

连接到Hive:

beeline -u "jdbc:hive2://master:10000" -n root

案例1:数据库和表的基本操作

1.1 数据库操作

-- 查看所有数据库
SHOW DATABASES;

-- 创建数据库
CREATE DATABASE student_db;

-- 使用数据库
USE student_db;

-- 查看当前使用的数据库
SELECT current_database();

-- 查看数据库详细信息
DESCRIBE DATABASE student_db;

1.2 创建表(重要:指定字段分隔符)

-- 创建学生信息表(指定逗号分隔符)
CREATE TABLE students (
    id INT,
    name STRING,
    age INT,
    major STRING,
    gpa DOUBLE,
    email STRING
)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ','
STORED AS TEXTFILE;

-- 创建课程表
CREATE TABLE courses (
    course_id STRING,
    course_name STRING,
    credits INT,
    instructor STRING
)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ','
STORED AS TEXTFILE;

-- 创建成绩表
CREATE TABLE grades (
    student_id INT,
    course_id STRING,
    grade STRING,
    semester STRING
)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ','
STORED AS TEXTFILE;

1.3 查看表信息

-- 查看所有表
SHOW TABLES;

-- 查看表结构
DESCRIBE students;

-- 查看表的详细信息
DESCRIBE FORMATTED students;

-- 查看创建表的SQL语句
SHOW CREATE TABLE students;

1.4 修改表结构

-- 重命名表
ALTER TABLE grades RENAME TO student_grades;

-- 查看修改后的表结构
DESCRIBE student_grades;
SHOW TABLES;

案例2:数据导入和导出

2.1 方法一:使用INSERT直接插入数据(推荐)

USE student_db;

-- 向学生表插入数据
INSERT INTO students VALUES 
(1, '张三', 20, '计算机科学', 3.8, 'zhangsan@email.com'),
(2, '李四', 21, '数学', 3.6, 'lisi@email.com'),
(3, '王五', 19, '物理', 3.9, 'wangwu@email.com'),
(4, '赵六', 22, '化学', 3.5, 'zhaoliu@email.com'),
(5, '孙七', 20, '计算机科学', 3.7, 'sunqi@email.com');

-- 向课程表插入数据
INSERT INTO courses VALUES 
('CS101', '程序设计基础', 3, '张老师'),
('MATH201', '高等数学', 4, '李老师'),
('PHYS101', '大学物理', 3, '王老师'),
('CHEM101', '普通化学', 3, '赵老师'),
('CS201', '数据结构', 3, '陈老师');

-- 向成绩表插入数据
INSERT INTO student_grades VALUES 
(1, 'CS101', 'A', '2023春季'),
(1, 'MATH201', 'B+', '2023春季'),
(2, 'CS101', 'B', '2023春季'),
(2, 'MATH201', 'A', '2023春季'),
(3, 'PHYS101', 'A', '2023春季'),
(3, 'MATH201', 'B+', '2023春季'),
(4, 'CHEM101', 'A-', '2023春季'),
(5, 'CS101', 'A-', '2023春季'),
(5, 'CS201', 'B+', '2023秋季');

-- 验证数据插入
SELECT COUNT(*) as student_count FROM students;
SELECT COUNT(*) as course_count FROM courses;
SELECT COUNT(*) as grade_count FROM student_grades;

2.2 方法二:从文件加载数据

# 如果要使用文件加载,需要先退出beeline
!quit

# 创建正确格式的数据文件
mkdir -p /tmp/student_data

# 创建学生数据文件(确保格式正确)
cat > /tmp/student_data/students.txt << EOF
6,周八,21,英语,3.4,zhouba@email.com
7,吴九,20,历史,3.6,wujiu@email.com
8,郑十,19,计算机科学,3.8,zhengshi@email.com
EOF

# 上传到HDFS
hdfs dfs -mkdir -p /user/student_data
hdfs dfs -put /tmp/student_data/students.txt /user/student_data/

# 重新连接beeline
beeline -u "jdbc:hive2://master:10000" -n root
-- 加载文件数据到表
USE student_db;
LOAD DATA INPATH '/user/student_data/students.txt' INTO TABLE students;

-- 验证加载结果
SELECT * FROM students WHERE id > 5;

2.3 数据导出

-- 导出查询结果到HDFS
INSERT OVERWRITE DIRECTORY '/user/output/high_gpa_students'
ROW FORMAT DELIMITED FIELDS TERMINATED BY ','
SELECT * FROM students WHERE gpa >= 3.7;

-- 导出到本地文件系统
INSERT OVERWRITE LOCAL DIRECTORY '/tmp/student_output'
ROW FORMAT DELIMITED FIELDS TERMINATED BY ','
SELECT name, major, gpa FROM students ORDER BY gpa DESC;

案例3:数据查询

3.1 基本查询

-- 查看所有学生
SELECT * FROM students;

-- 查询特定列
SELECT name, age, major FROM students;

-- 条件查询
SELECT * FROM students WHERE age >= 21;
SELECT name, gpa FROM students WHERE major = '计算机科学';
SELECT * FROM students WHERE gpa > 3.6 AND age < 21;

-- 模糊查询
SELECT * FROM students WHERE name LIKE '张%';
SELECT * FROM students WHERE email LIKE '%@email.com';

-- 范围查询
SELECT * FROM students WHERE gpa BETWEEN 3.5 AND 3.8;
SELECT * FROM students WHERE major IN ('计算机科学', '数学');

3.2 排序和限制

-- 按GPA降序排列
SELECT name, major, gpa FROM students ORDER BY gpa DESC;

-- 按年龄升序,GPA降序排列
SELECT name, age, gpa FROM students ORDER BY age ASC, gpa DESC;

-- 查询前3名学生
SELECT name, gpa FROM students ORDER BY gpa DESC LIMIT 3;

-- 查询第4-6名学生
SELECT name, gpa FROM students ORDER BY gpa DESC LIMIT 3 OFFSET 3;

3.3 分组统计

-- 按专业统计学生数量
SELECT major, COUNT(*) as student_count FROM students GROUP BY major;

-- 按专业统计平均GPA
SELECT major, COUNT(*) as count, AVG(gpa) as avg_gpa, ROUND(AVG(gpa), 2) as rounded_avg_gpa
FROM students GROUP BY major;

-- 按年龄分组,显示每个年龄段的学生数和最高GPA
SELECT age, COUNT(*) as count, MAX(gpa) as max_gpa 
FROM students GROUP BY age;

-- 使用HAVING过滤分组结果
SELECT major, COUNT(*) as count, AVG(gpa) as avg_gpa 
FROM students 
GROUP BY major 
HAVING COUNT(*) >= 2;

-- 统计各个成绩等级的人数
SELECT grade, COUNT(*) as count 
FROM student_grades 
GROUP BY grade 
ORDER BY count DESC;

3.4 聚合函数应用

-- 基本统计信息
SELECT 
    COUNT(*) as total_students,
    MAX(gpa) as highest_gpa,
    MIN(gpa) as lowest_gpa,
    AVG(gpa) as average_gpa,
    ROUND(AVG(gpa), 2) as rounded_avg_gpa,
    SUM(age) as total_age
FROM students;

-- 按条件统计
SELECT 
    COUNT(CASE WHEN gpa >= 3.7 THEN 1 END) as excellent_count,
    COUNT(CASE WHEN gpa >= 3.5 AND gpa < 3.7 THEN 1 END) as good_count,
    COUNT(CASE WHEN gpa < 3.5 THEN 1 END) as average_count
FROM students;
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment