Course Project - Fin Version

Hive Data Analysis

1. Create the table
CREATE TABLE law (
ip bigint,
area int,
ie_proxy string,
ie_type string,
userid string,
clientid string,
time_stamp bigint,
time_format string,
pagepath string,
ymd int,
visiturl string,
page_type string,
host string,
page_title string,
page_title_type int,
page_title_name string,
title_keyword string,
in_port string,
in_url string,
search_keyword string,
source string)
ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' 
STORED AS TEXTFILE;

Load the data

load data inpath '/lawdata.csv' overwrite into table law;
2. Page-type statistics on table law
-- Total number of records in table law (837502; this count is the denominator in the weight calculations below)
select count(1) from law;

select substring(page_type,2,3) as page_type,
count(*) as count_num,
round((count(*)/837502.0)*100,4) as weights 
from law group by substring(page_type,2,3) 
order by count_num desc limit 7;
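The 837502.0 above is the total record count pasted in by hand from the previous query. A hedged alternative, assuming the same law table, derives the denominator inside the query itself so the weights stay correct if the data is reloaded:

-- Sketch: derive the total via a one-row subquery instead of hardcoding it
select substring(l.page_type,2,3) as page_type,
count(*) as count_num,
round(count(*)*100.0/max(t.total),4) as weights
from law l cross join (select count(*) as total from law) t
group by substring(l.page_type,2,3)
order by count_num desc limit 7;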

set hive.exec.dynamic.partition=true;
set hive.exec.dynamic.partition.mode=nonstrict;
-- Create the dynamic-partition table law_part
CREATE TABLE law_part (
ip bigint,
area int,
ie_proxy string,
ie_type string,
userid string,
clientid string,
time_stamp bigint,
time_format string,
pagepath string,
ymd int,
visiturl string,
page_type string,
host string,
page_title string,
page_title_type int,
page_title_name string,
title_keyword string,
in_port string,
in_url string,
search_keyword string,
source string)
partitioned by(page_type_part string)
ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' STORED AS TEXTFILE;
-- Load data into the partitions (the partition column must come last in the SELECT)
INSERT OVERWRITE TABLE law_part PARTITION (page_type_part)
SELECT *, substring(page_type,2,3) FROM law;
-- Sub-types within page type 101 (411665 is the record count of that partition)
select substring(page_type,2,6) as page_type,
count(*) as count_num,
round((count(*)/411665.0)*100,4) as weights
from law_part
where page_type_part='101'
group by substring(page_type,2,6)
order by count_num desc;
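A quick check that is not in the original flow: list the partitions the dynamic insert created and count the rows in each (the 411665 used above is the row count of partition 101):

-- Sketch: inspect the dynamic partitions
show partitions law_part;
select page_type_part, count(*) from law_part group by page_type_part;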

-- Total number of records whose visited URL contains '?'
select count(*) as num from law_part where visiturl like '%?%';

-- Share of each page type among records containing '?' (65357 is the count from the previous query)
select substring(page_type,2,6) as page_type,
count(*),
round((count(*)*100)/65357,4) as weights
from law
where visiturl like '%?%'
group by substring(page_type,2,6)
order by weights desc limit 5;

3. Click-count analysis
-- Number of distinct users (347609)
select count(distinct(userid)) from law;

-- Create the click-count partition table, partitioned by click_part
set hive.exec.dynamic.partition=true;
set hive.exec.dynamic.partition.mode=nonstrict;
create table law_click (
user_num int,
user_weights double,
record_weights double
) partitioned by (click_part string)
 ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' 
 STORED AS TEXTFILE;
-- Load data into the partitioned table law_click
INSERT OVERWRITE TABLE law_click PARTITION (click_part)
select count(click_num) as count,
round(count(click_num)*100/347609.0,2),
round((count(click_num)*click_num)*100/837502.0,2),
click_num
from (select count(userid) as click_num from law group by userid) tmp_table
group by click_num order by count desc limit 8;
-- Click-count analysis
set hive.exec.reducers.max=1;
select click_num,
count(click_num) as count,
round(count(click_num)*100/347609.0,2),
round((count(click_num)*click_num)*100/837502.0,2)
from (select count(userid) as click_num from law group by userid) tmp_table
group by click_num order by count desc limit 8;
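As before, the denominators 347609 (distinct users) and 837502 (total records) are pasted in by hand. A hedged variant, assuming the same law table, derives both inline:

-- Sketch: compute both denominators inside the query
select click_num,
count(click_num) as count,
round(count(click_num)*100/max(u.users),2),
round((count(click_num)*click_num)*100/max(u.recs),2)
from (select count(userid) as click_num from law group by userid) tmp_table
cross join (select count(distinct userid) as users, count(*) as recs from law) u
group by click_num order by count desc limit 8;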

-- Behavior of users who visited only once (227732 is the number of such records)
select page_type,
count(page_type) as count,
round((count(page_type)*100)/227732.0,4)
from (select substring(a.page_type,2,6) as page_type
      from law a,
           (select userid from law group by userid having(count(userid)=1)) b
      where a.userid = b.userid) c
group by page_type order by count desc limit 5;

-- Pages viewed by single-visit users
select a.visiturl, count(*) as count
from law a,
     (select userid from law group by userid having(count(userid)=1)) b
where a.userid = b.userid
group by a.visiturl order by count desc limit 7;

-- Top ten pages by visit count
select a.visiturl, count(*) as count
from law a
where a.visiturl like '%.html%'
group by a.visiturl order by count desc limit 10;

Pig Data Cleaning
-- NOTE: the positional field references ($n) were lost in extraction and are
-- reconstructed here from the column order of the Hive table law:
-- $4 = userid, $6 = time_stamp, $10 = visiturl, $11 = page_type, $13 = page_title
law = load '/lawdata.csv' using PigStorage(',');
law_filter = filter law by $10 matches '.*\\.html'
and not ($10 matches '.*midques_.*')
and not ($10 matches '.*browse.*')
and not ($10 matches '.*\\?.*')
and not ($13 matches '.*法律快车-律师助手.*')
and (SUBSTRING($11,0,3) == '101' or SUBSTRING($11,0,3) == '107');
-- law_dist_field = foreach law_filter generate $4, $10, $6;  -- user,url,timestamp
law_code_field = foreach law_filter generate $6, $4, $10, $11; -- timestamp,user,url,urlType
set job.name 'store_result';
-- store law_dist_field into '/law_cleaned' using PigStorage(',');
store law_code_field into '/law_code_cleaned' using PigStorage(',');
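Assuming the statements above are saved to a script file (the name law_clean.pig is mine), the job can be launched on the cluster with:

pig -x mapreduce law_clean.pig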
Spark Data Encoding and Reduction
// Load data; raw records: timestamp,user,url,urlType
val rawDataPath = "/law_code_cleaned/*"
val dataAll = sc.textFile(rawDataPath).map{x => val fields = x.split(","); (fields(0), fields(1), fields(2), fields(3))}
// Distinct, sorted user and URL lists
val userUrl = dataAll.map(x => (x._2, x._3))
val allUserList = userUrl.map(data => data._1).distinct.sortBy(x => x)
val allUrlList = userUrl.map(data => data._2).distinct.sortBy(x => x)
// Assign an integer code to each user and each URL
val allUserIdList = allUserList.zipWithIndex.map(data => (data._1, data._2.toString))
val allUrlIdList = allUrlList.zipWithIndex.map(data => (data._1, data._2.toString))
// Save the code tables
allUserIdList.map(x => x._1 + "," + x._2).repartition(1).saveAsTextFile("/law_userlist_Coding")
allUrlIdList.map(x => x._1 + "," + x._2).repartition(1).saveAsTextFile("/law_urllist_Coding")
// Replace users and URLs in the raw data with their codes
val replacedDataAll = dataAll.map(x => (x._2, (x._1, x._3, x._4))).
  join(allUserIdList).map(x => (x._2._1._2, (x._2._1._1, x._2._2, x._2._1._3))).
  join(allUrlIdList).map(x => (x._2._1._1, x._2._1._2, x._2._2, x._2._1._3))
// Save the encoded data
replacedDataAll.saveAsTextFile("/law_data_Coding")
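A small sanity check that is not in the original: the two joins only look up keys drawn from dataAll itself, so no records should be lost, and each element should now carry numeric IDs:

// Verify the encoding step (assumes the vals above are still in scope)
println(dataAll.count == replacedDataAll.count) // expected: true
println(replacedDataAll.first)                  // (timestamp, userId, urlId, urlType)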
import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD

// Convert (user, url) visit counts into discrete ratings. splitVisitedNumArray
// holds the bucket boundaries and splitVisitedNumValues the rating for each
// bucket; pairs visited fewer than splitVisitedNumArray(0) times are dropped.
def initRatingwithNum(sc: SparkContext, inputDir: String, splitVisitedNumArray: Array[Int],
    splitVisitedNumValues: Array[Int]): RDD[(Int, Int, Int)] = {
  // Records were saved as "(timestamp,user,url,urlType)"; slice strips the parentheses
  val dataRaw = sc.textFile(inputDir).map { x =>
    val fields = x.slice(1, x.size - 1).split(",")
    (fields(1).toInt, fields(2).toInt)
  }
  // Count visits per (user, url) pair and keep pairs visited often enough
  val dataCount = dataRaw.map(x => (x, 1)).reduceByKey((x, y) => x + y).
    filter(x => x._2 >= splitVisitedNumArray(0))
  // ((lower, upper), ratingValue) triples describing the buckets
  val spliterPointwithValue = (-1 +: splitVisitedNumArray).zip(splitVisitedNumArray :+ Integer.MAX_VALUE).
    zip(splitVisitedNumValues)
  // Emit (user, url, rating) using the first bucket that matches the count
  dataCount.map { x => (x._1._1, x._1._2, spliterPointwithValue.filter(y => y._1._1 >= x._2 && y._1._2 > x._2)(0)._2) }
}

// Change the input path if yours differs
val data = initRatingwithNum(sc, "/law_data_Coding/*", Array(2, 5, 10, 20, 20000), Array(0, 2, 3, 4, 5, 6))
data.map(x => x._1 + "," + x._2 + "," + x._3).saveAsTextFile("/law_data_Guiyue")
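To make the bucketing concrete, here is a minimal local sketch (mine, not the original's) applying the same interval logic to a few sample counts; with these arrays, a visit count of 2 maps to rating 2, 3-5 to 3, 6-10 to 4, 11-20 to 5, and 21-20000 to 6:

// Local illustration of the rating buckets used above
val bounds = Array(2, 5, 10, 20, 20000)
val values = Array(0, 2, 3, 4, 5, 6)
val buckets = (-1 +: bounds).zip(bounds :+ Integer.MAX_VALUE).zip(values)
for (c <- Seq(2, 3, 5, 6, 10, 11, 20, 21))
  println(c + " -> " + buckets.filter(y => y._1._1 >= c && y._1._2 > c)(0)._2)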
Building the ALS Model

# -*- coding: utf-8 -*-
# @Time    : 2022/1/8 21:01
# @Author  : ..
# @Email   : [email protected]
# @File    : Test3.py
# @Software: PyCharm
from pyspark import SparkContext
from pyspark.mllib.recommendation import Rating, ALS  # Rating and ALS from Spark's mllib library
import os
os.environ['PYSPARK_PYTHON'] = '/usr/bin/python'
os.environ['PYSPARK_DRIVER_PYTHON'] = '/usr/bin/python'
os.environ['JAVA_HOME'] = '/root/jdk1.8.0_162'

print("使用Spark-ALS算法")
sc = SparkContext('local', 'law_re')
rawData = sc.textFile("hdfs://master:9000/law_data_guiYue")


rawRatings = rawData.map(lambda x: x.split(','))  # split each line on commas
ratings = rawRatings.map(lambda x: Rating(int(x[0]), int(x[1]), float(x[2])))  # one Rating per line

rank = 50      # number of latent factors
maxIter = 10   # number of iterations
param = 0.01   # regularization parameter (lambda)
model = ALS.train(ratings, rank, maxIter, param)  # train the model

# model.save(sc,"hdfs://master:9000/law_model")

# Print part of the latent-factor (user feature) matrix
userFeatures = model.userFeatures()
print(userFeatures.take(2))

print(model.recommendProducts(27797, 5))  # recommend 5 URLs to user 27797
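Before the interactive loop, one might also check how well the model reconstructs the training ratings. A minimal sketch using the standard mllib predictAll API (the variable names below are mine, and MSE on training data measures fit, not generalization):

# Hedged sketch: mean squared error of the model on the training ratings
usersProducts = ratings.map(lambda r: (r.user, r.product))
predictions = model.predictAll(usersProducts).map(lambda r: ((r.user, r.product), r.rating))
ratesAndPreds = ratings.map(lambda r: ((r.user, r.product), r.rating)).join(predictions)
MSE = ratesAndPreds.map(lambda r: (r[1][0] - r[1][1]) ** 2).mean()
print("Mean Squared Error = " + str(MSE))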

while True:
    userId, k = map(int, input('Enter a user ID and the number of pages to recommend: ').split(","))
    print(model.recommendProducts(userId, k))  # recommend k URLs to the given user
    urlId, k = map(int, input('Enter a page ID and the number of users to recommend: ').split(","))
    print(model.recommendUsers(urlId, k))  # recommend the given URL to k users
