利用 PySpark 对武汉租房数据进行分析;爬取其他地区的租房数据后,也可以直接套用本代码。
代码如下:
# Rental-listing analysis with PySpark.
#
# Reads "zh.csv" (columns used: 租金 = rent, 面积 = area, 区划 = district,
# 附近地铁 = nearby subway, with "无" meaning none), computes per-district
# statistics and rent/area histograms, then charts the per-district
# statistics with matplotlib and pyecharts.
from pyspark.sql import SparkSession
from pyspark.sql.types import IntegerType
import pandas as pd
from pyspark.ml.stat import Correlation
import matplotlib.pyplot as plt
import pyspark.sql.functions as F

# Histogram layout: bucket i covers the half-open range
# (width * i, width * (i + 1)].
PRICE_BUCKET_WIDTH = 1000
PRICE_BUCKET_COUNT = 27
SIZE_BUCKET_WIDTH = 50
SIZE_BUCKET_COUNT = 12


def _bucket_counts(df, column, width, buckets):
    """Count the rows of *df* falling into each bucket of *column*.

    Bucket ``i`` (for ``0 <= i < buckets``) holds the rows with
    ``width * i < value <= width * (i + 1)``. Rows whose value is null or
    outside every bucket are ignored.

    Returns a list of ``buckets`` integers, computed in a single Spark job
    instead of one ``count()`` job per bucket.
    """
    # ceil(value / width) - 1 maps (width*i, width*(i+1)] onto i.
    bucket = (F.ceil(F.col(column) / width) - 1).alias("bucket")
    rows = (
        df.select(bucket)
        .where((F.col("bucket") >= 0) & (F.col("bucket") < buckets))
        .groupBy("bucket")
        .count()
        .collect()
    )
    found = {row["bucket"]: row["count"] for row in rows}
    return [found.get(i, 0) for i in range(buckets)]


spark = SparkSession.builder.master("local").appName("rent_analyse").getOrCreate()

df1 = spark.read.csv("zh.csv", header=True, encoding="UTF-8")
# The CSV is read as strings; values that are not integers become null.
df1 = df1.withColumn("租金", df1.租金.cast(IntegerType()))
df1 = df1.withColumn("面积", df1.面积.cast(IntegerType()))

# All per-district statistics in one aggregation pass. Rows without a
# district are skipped: they cannot be labelled on a chart, and comparing a
# column with null matches nothing, which previously led to a division by
# zero when computing the subway rate.
district_rows = (
    df1.where(F.col("区划").isNotNull())
    .groupBy("区划")
    .agg(
        F.avg("租金").alias("mean_price"),
        F.max("租金").alias("max_price"),
        F.min("租金").alias("min_price"),
        F.avg("面积").alias("mean_size"),
        F.max("面积").alias("max_size"),
        F.min("面积").alias("min_size"),
        F.count(F.lit(1)).alias("listings"),
        # A null subway value is not counted, matching a plain != filter.
        F.sum(F.when(F.col("附近地铁") != "无", 1).otherwise(0)).alias("with_subway"),
    )
    .orderBy("区划")  # stable district order across runs
    .collect()
)

place = [row["区划"] for row in district_rows]
mean_price = [row["mean_price"] for row in district_rows]
max_price = [row["max_price"] for row in district_rows]
min_price = [row["min_price"] for row in district_rows]
mean_size = [row["mean_size"] for row in district_rows]
max_size = [row["max_size"] for row in district_rows]
min_size = [row["min_size"] for row in district_rows]
place_number = [row["listings"] for row in district_rows]
# Every group has at least one row, so the divisor is never zero.
subway_rate = [row["with_subway"] / row["listings"] for row in district_rows]

# Rent per unit of area for every listing (not used by the charts below).
df_new = df1.withColumn("性价比", F.col("租金") / F.col("面积"))

# Listing counts per rent bucket and per area bucket (not charted below).
price_number = _bucket_counts(df1, "租金", PRICE_BUCKET_WIDTH, PRICE_BUCKET_COUNT)
size_number = _bucket_counts(df1, "面积", SIZE_BUCKET_WIDTH, SIZE_BUCKET_COUNT)

# The metrics to chart, in display order, with their series labels.
zhibiao = [
    mean_price,
    max_price,
    min_price,
    mean_size,
    max_size,
    min_size,
    place_number,
    subway_rate,
]
zhibiao_names = [
    "平均租金",
    "最高租金",
    "最低租金",
    "平均面积",
    "最大面积",
    "最小面积",
    "房源数量",
    "地铁覆盖率",
]

# Font setup is loop-invariant: configure it once, before any drawing.
plt.rcParams['font.sans-serif'] = ['SimHei']  # render Chinese labels correctly
plt.rcParams['axes.unicode_minus'] = False

# One figure per metric; the title tells the otherwise identical charts apart.
for series_name, values in zip(zhibiao_names, zhibiao):
    plt.bar(place, values)
    plt.title(series_name)
    plt.show()

# Imported here, as in the original script, so the matplotlib charts above
# are still shown when pyecharts is unavailable. This is the legacy
# pyecharts 0.x API (`from pyecharts import Bar`).
from pyecharts import Bar

bar = Bar("武汉市租房租金概况")
# Bar.add takes the series name first; the value list was previously passed
# in that position by mistake.
for series_name, values in zip(zhibiao_names, zhibiao):
    bar.add(series_name, place, values, is_stack=True)
bar.render()
欢迎分享,转载请注明来源:内存溢出
评论列表(0条)