连接到Hive:
beeline -u "jdbc:hive2://master:10000" -n root-- 查看所有数据库
SHOW DATABASES;
-- 创建数据库
CREATE DATABASE student_db;
-- 使用数据库
USE student_db;
-- 查看当前使用的数据库
SELECT current_database();
-- 查看数据库详细信息
DESCRIBE DATABASE student_db;-- 创建学生信息表(指定逗号分隔符)
CREATE TABLE students (
id INT,
name STRING,
age INT,
major STRING,
gpa DOUBLE,
email STRING
)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ','
STORED AS TEXTFILE;
-- 创建课程表
CREATE TABLE courses (
course_id STRING,
course_name STRING,
credits INT,
instructor STRING
)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ','
STORED AS TEXTFILE;
-- 创建成绩表
CREATE TABLE grades (
student_id INT,
course_id STRING,
grade STRING,
semester STRING
)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ','
STORED AS TEXTFILE;-- 查看所有表
SHOW TABLES;
-- 查看表结构
DESCRIBE students;
-- 查看表的详细信息
DESCRIBE FORMATTED students;
-- 查看创建表的SQL语句
SHOW CREATE TABLE students;-- 重命名表
ALTER TABLE grades RENAME TO student_grades;
-- 查看修改后的表结构
DESCRIBE student_grades;
SHOW TABLES;USE student_db;
-- 向学生表插入数据
INSERT INTO students VALUES
(1, '张三', 20, '计算机科学', 3.8, 'zhangsan@email.com'),
(2, '李四', 21, '数学', 3.6, 'lisi@email.com'),
(3, '王五', 19, '物理', 3.9, 'wangwu@email.com'),
(4, '赵六', 22, '化学', 3.5, 'zhaoliu@email.com'),
(5, '孙七', 20, '计算机科学', 3.7, 'sunqi@email.com');
-- 向课程表插入数据
INSERT INTO courses VALUES
('CS101', '程序设计基础', 3, '张老师'),
('MATH201', '高等数学', 4, '李老师'),
('PHYS101', '大学物理', 3, '王老师'),
('CHEM101', '普通化学', 3, '赵老师'),
('CS201', '数据结构', 3, '陈老师');
-- 向成绩表插入数据
INSERT INTO student_grades VALUES
(1, 'CS101', 'A', '2023春季'),
(1, 'MATH201', 'B+', '2023春季'),
(2, 'CS101', 'B', '2023春季'),
(2, 'MATH201', 'A', '2023春季'),
(3, 'PHYS101', 'A', '2023春季'),
(3, 'MATH201', 'B+', '2023春季'),
(4, 'CHEM101', 'A-', '2023春季'),
(5, 'CS101', 'A-', '2023春季'),
(5, 'CS201', 'B+', '2023秋季');
-- 验证数据插入
SELECT COUNT(*) as student_count FROM students;
SELECT COUNT(*) as course_count FROM courses;
SELECT COUNT(*) as grade_count FROM student_grades;# 如果要使用文件加载,需要先退出beeline
!quit
# 创建正确格式的数据文件
mkdir -p /tmp/student_data
# 创建学生数据文件(确保格式正确)
cat > /tmp/student_data/students.txt << EOF
6,周八,21,英语,3.4,zhouba@email.com
7,吴九,20,历史,3.6,wujiu@email.com
8,郑十,19,计算机科学,3.8,zhengshi@email.com
EOF
# 上传到HDFS
hdfs dfs -mkdir -p /user/student_data
hdfs dfs -put /tmp/student_data/students.txt /user/student_data/
# 重新连接beeline
beeline -u "jdbc:hive2://master:10000" -n root-- 加载文件数据到表
USE student_db;
LOAD DATA INPATH '/user/student_data/students.txt' INTO TABLE students;
-- 验证加载结果
SELECT * FROM students WHERE id > 5;-- 导出查询结果到HDFS
INSERT OVERWRITE DIRECTORY '/user/output/high_gpa_students'
ROW FORMAT DELIMITED FIELDS TERMINATED BY ','
SELECT * FROM students WHERE gpa >= 3.7;
-- 导出到本地文件系统
INSERT OVERWRITE LOCAL DIRECTORY '/tmp/student_output'
ROW FORMAT DELIMITED FIELDS TERMINATED BY ','
SELECT name, major, gpa FROM students ORDER BY gpa DESC;-- 查看所有学生
SELECT * FROM students;
-- 查询特定列
SELECT name, age, major FROM students;
-- 条件查询
SELECT * FROM students WHERE age >= 21;
SELECT name, gpa FROM students WHERE major = '计算机科学';
SELECT * FROM students WHERE gpa > 3.6 AND age < 21;
-- 模糊查询
SELECT * FROM students WHERE name LIKE '张%';
SELECT * FROM students WHERE email LIKE '%@email.com';
-- 范围查询
SELECT * FROM students WHERE gpa BETWEEN 3.5 AND 3.8;
SELECT * FROM students WHERE major IN ('计算机科学', '数学');-- 按GPA降序排列
SELECT name, major, gpa FROM students ORDER BY gpa DESC;
-- 按年龄升序,GPA降序排列
SELECT name, age, gpa FROM students ORDER BY age ASC, gpa DESC;
-- 查询前3名学生
SELECT name, gpa FROM students ORDER BY gpa DESC LIMIT 3;
-- 查询第4-6名学生
SELECT name, gpa FROM students ORDER BY gpa DESC LIMIT 3 OFFSET 3;-- 按专业统计学生数量
SELECT major, COUNT(*) as student_count FROM students GROUP BY major;
-- 按专业统计平均GPA
SELECT major, COUNT(*) as count, AVG(gpa) as avg_gpa, ROUND(AVG(gpa), 2) as rounded_avg_gpa
FROM students GROUP BY major;
-- 按年龄分组,显示每个年龄段的学生数和最高GPA
SELECT age, COUNT(*) as count, MAX(gpa) as max_gpa
FROM students GROUP BY age;
-- 使用HAVING过滤分组结果
SELECT major, COUNT(*) as count, AVG(gpa) as avg_gpa
FROM students
GROUP BY major
HAVING COUNT(*) >= 2;
-- 统计各个成绩等级的人数
SELECT grade, COUNT(*) as count
FROM student_grades
GROUP BY grade
ORDER BY count DESC;-- 基本统计信息
SELECT
COUNT(*) as total_students,
MAX(gpa) as highest_gpa,
MIN(gpa) as lowest_gpa,
AVG(gpa) as average_gpa,
ROUND(AVG(gpa), 2) as rounded_avg_gpa,
SUM(age) as total_age
FROM students;
-- 按条件统计
SELECT
COUNT(CASE WHEN gpa >= 3.7 THEN 1 END) as excellent_count,
COUNT(CASE WHEN gpa >= 3.5 AND gpa < 3.7 THEN 1 END) as good_count,
COUNT(CASE WHEN gpa < 3.5 THEN 1 END) as average_count
FROM students;