CREATE TABLE law ( ip bigint, area int, ie_proxy string, ie_type string , userid string, clientid string, time_stamp bigint, time_format string, pagepath string, ymd int, visiturl string, page_type string, host string, page_title string, page_title_type int, page_title_name string, title_keyword string, in_port string, in_url string, search_keyword string, source string) ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' STORED AS TEXTFILE;
导入数据
load data inpath '/lawdata.csv' overwrite into table law;2、统计表law中网页类型
#查询表law中总记录数 select count(1) from law;
select substring(page_type,2,3) as page_type, count(*) as count_num, round((count(*)/837502.0)*100,4) as weights from law group by substring(page_type,2,3) order by count_num desc limit 7;
set hive.exec.dynamic.partition=true; set hive.exec.dynamic.partition.mode=nostrict; ----创建动态分区表law_part CREATE TABLE law_part ( ip bigint, area int, ie_proxy string, ie_type string , userid string, clientid string, time_stamp bigint, time_format string, pagepath string, ymd int, visiturl string, page_type string, host string, page_title string, page_title_type int, page_title_name string, title_keyword string, in_port string, in_url string, search_keyword string, source string) partitioned by(page_type_part string) ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' STORED AS TEXTFILE; -----导入数据到分区 INSERT OVERWRITE TABLE law_part PARTITION (page_type_part) SELECt *,substring(page_type,2,3) FROM law ; -----统计101网页类型的子类型 select substring(page_type,2,6) as page_type,count(*) as count_num,round((count(*)/411665.0)*100,4) as weights from law_part where page_type_part=101 group by substring(page_type,2,6) order by count_num desc;
----统计访问网页中带有“?”的记录总数 select count(*) as num from law_part where visiturl like '%?%';
----统计带有?的所有记录中,各网页类型所占比例 select substring(page_type,2,6) as page_type,count(*),round((count(*)*100)/65357,4) as weights from law where visiturl like '%?%' group by substring(page_type,2,6) order by weights desc limit 5;3、点击次数分析
----用户个数统计 select count(distinct(userid)) from law;
----创建点击次数分区表,分区字段click_part set hive.exec.dynamic.partition=true; set hive.exec.dynamic.partition.mode=nostrict; create table law_click ( user_num int, user_weights double, record_weights double ) partitioned by (click_part string) ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' STORED AS TEXTFILE;
----导入分区数据到分区表law_click INSERT OVERWRITE TABLE law_click PARTITION (click_part) select count(click_num) as count,round(count(click_num)*100/347609.0,2),round((count(click_num)*click_num)*100/837502.0,2),click_num from ( select count(userid) as click_num from law group by userid ) tmp_table group by click_num order by count desc limit 8;
----点击次数分析 set hive.exec.reducers.max=1; select click_num, count(click_num) as count,round(count(click_num)*100/347609.0,2),round((count(click_num)*click_num)*100/837502.0,2)from ( select count(userid) as click_num from law group by userid ) tmp_table group by click_num order by count desc limit 8;
----浏览一次用户行为分析 select page_type,count(page_type) as count,round((count(page_type)*100)/227732.0,4) from (select substring(a.page_type,2,6) as page_type from law a,(select userid from law group by userid having(count(userid)=1)) b where a.userid = b.userid) c group by page_type order by count desc limit 5;
----点击一次用户浏览网页统计分析 select a.visiturl,count(*) as count from law a,(select userid from law group by userid having(count(userid)=1)) b where a.userid = b.userid group by a.visiturl order by count desc limit 7;
-----访问量前十大网页排名 select a.visiturl,count(*) as count from law a where a.visiturl like '%.html%' group by a.visiturl order by count desc limit 10;pig数据清洗过程
law = load '/lawdate.csv' using PigStorage (','); law_filter = filter law by matches '.*\.html' and not ( matches '.*midques_.*') and not ( matches '.*browse.*') and not ( matches '.*\?.*') and not ( matches '.*法律快车-律师助手.*') and (SUBSTRINg(,0,3) == '101' or SUBSTRINg(,0,3) == '107'); //law_dist_field = foreach law_filter generate ,,; law_code_field = foreach law_filter generate ,,,; set job.name 'store_result'; //user,url,timestamp //store law_dist_field into '/law_cleaned' using PigStorage(','); //timestamp,user,url,urlType store law_code_field into '/law_code_cleaned' using PigStorage(',');Spark数据编码归约
// 加载数据,原始数据:timestamp,user,url,urlType val rawDataPath = "/law_code_cleaned/*" val dataAl1 = sc.textFile(rawDataPath).map{x => val fields=x.split(","); (fields(0),fields(1),fields(2),fields(3))} // 排序去重后用户、URL数据 val userUrl = dataAl1.map(x => (x._2,x._3)) val alluserList = userUrl.map(data=>data._1).distinct.sortBy(x=>x) val allUrlList = userUrl.map(data=>data._2).distinct.sortBy(x=>x) // 构造用户、URL编码 val allUserIdList = alluserList.zipWithIndex.map(data=>(data._1,data._2.toString)) val allUrlIdList = allUrlList.zipWithIndex.map(data=>(data._1,data._2.toString)) // 保存编码数据 allUserIdList.map(x => x._1 +","+x._2).repartition(1).saveAsTextFile("/law_userlist_Coding") allUrlIdList.map(x => x._1 +","+x._2).repartition(1).saveAsTextFile("/law_urllist_Coding") // 替换原始数据 val replacedDataAl1 = dataAl1.map(x => (x._2,(x._1,x._3,x._4))). join(allUserIdList).map(x => (x._2._1._2,(x._2._1._1,x._2._2,x._2._1._3))).join(allUrlIdList).map(x => (x._2._1._1,x._2._1._2,x._2._2,x._2._1._3)) // 保存编码后数据 replacedDataAl1.saveAsTextFile ("/law_data_Coding") import org.apache.spark.SparkContext import org.apache.spark.rdd.RDD def initRatingwithNum(sc:SparkContext, inputDir: String, splitVisitedNumArray: Array[Int],splitVisitedNumValues : Array[Int]): RDD[(Int, Int,Int)]= { val dataRaw = sc.textFile(inputDir).map { x => val fields = x.slice(1,x.size-1)split(","); (fields(1).toInt, fields(2).toInt)} val dataCount = dataRaw.map(x => (x, 1)).reduceByKey((x,y)=>(x+y)). filter(x=> x._2 >= splitVisitedNumArray(0)) val spliterPointwithValue = (-1 +: splitVisitedNumArray).zip(splitVisitedNumArray:+ Integer.MAX_VALUE). zip(splitVisitedNumValues) dataCount.map{x=>(x._1._1,x._1._2,spliterPointwithValue.filter(y => y._1._1 >= x._2 && y._1._2 > x._2)(0)._2)} } //不一样需要改地址 val data = initRatingwithNum(sc,"/law_data_Coding/*",Array(2,5,10,20,20000),Array(0,2,3,4,5,6)) data.map(x =>x._1+","+x._2+","+x._3) data.saveAsTextFile ("/law_data_Guiyue")ALS模型建立
# -*- coding: utf-8 -*-
# @Time : 2022/1/8 21:01
# @Author : ..
# @Email : 1464770648@qq.com
# @File : Test3.py
# @Software: PyCharm
from pyspark import SparkContext
from pyspark.mllib.recommendation import Rating, ALS #mllib库调用Rating,ALS函数
import os
os.environ['PYSPARK_PYTHON'] = '/usr/bin/python'
os.environ['PYSPARK_DRIVER_PYTHON'] = '/usr/bin/python'
os.environ['JAVA_HOME'] = '/root/jdk1.8.0_162'
print("使用Spark-ALS算法")
sc = SparkContext('local', 'law_re')
rawData = sc.textFile("hdfs://master:9000/law_data_guiYue")
rawRatings = rawData.map(lambda x: x.split(',')) #以逗号为分隔符分割数据
ratings = rawRatings.map(lambda x: Rating(int(x[0]),int(x[1]),float(x[2]))) #每一行是一个Rating类型数据
rank=50 #潜在因子数
maxIter=10 #迭代次数
param=0.01 #精确度
model = ALS.train(ratings, rank,maxIter,param) #训练模型
# model.save(sc,"hdfs://master:9000/law_model")
# 输出隐语义模型的特征矩阵
userFeatures = model.userFeatures()
print(userFeatures.take(2))
print(model.recommendProducts(27797, 5)) #给指定用户推荐k个网址
while(1):
userId,k = map(int,input('请输入用户ID 以及推荐网页数').split(","))
print(model.recommendProducts(userId, k)) #给指定用户推荐k个网址
urlId,k = map(int,input('请输入网页ID 以及推荐用户数').split(","))
print(model.recommendUsers(urlId,k)) #对指定网址推荐给k个用户
欢迎分享,转载请注明来源:内存溢出
评论列表(0条)