0


基于Spark的淘宝数据分析学习案例

参考地址:https://dblab.xmu.edu.cn/blog/3525/

一,下载数据集

数据集

数据集的介绍

  数据集大小

![](https://img-blog.csdnimg.cn/direct/6de48a8cea63411594485446deeacb70.png)

二,数据预处理

使用 Python 对数据进行预处理。由于数据量比较大,只保留 2017-11-25 至 2017-12-03 之间的非 pv 行为,并且只选取前一部分数据:下面的示例代码在写满 10000 条后停止,如需选取前一百万条,请把代码中的上限相应调大。

import csv
import time
# Preprocess the raw UserBehavior CSV into a smaller file.
#
# A row is kept when its 4th column is not "pv" and its 5th column (an integer
# epoch timestamp) lies strictly between 2017-11-25 and 2017-12-03 as computed
# by time.mktime (local time). Each kept row is written as
# [row[0], row[1], row[3], "YYYY-mm-dd HH:MM:SS"]; the pass stops once
# 10000 rows have been written.
#
# NOTE(review): rows are appended to processed2.csv, while the Spark job shown
# later in this file reads processed.csv -- confirm which name is intended.
print("执行")
start_time = time.mktime(time.strptime('2017-11-25', '%Y-%m-%d'))
end_time = time.mktime(time.strptime('2017-12-03', '%Y-%m-%d'))
max_rows = 10000  # upper bound on the number of rows written in one run
i = 0  # rows written so far

# Both files are opened once for the whole pass; the with-statement closes
# them on exit (including on error), so no explicit close() is needed.
with open("D:\\UserBehavior.csv\\UserBehavior.csv", 'r') as fr, \
        open("D:\\UserBehavior.csv\\processed2.csv", "a+", newline='') as fw:
    writer = csv.writer(fw)
    for row in csv.reader(fr):
        # Blank or truncated lines have no action/timestamp column: skip them
        # instead of failing with IndexError.
        if len(row) < 5 or row[3] == "pv":
            continue
        timestamp = int(row[4])
        if not start_time < timestamp < end_time:
            continue
        str_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(timestamp))
        writer.writerow([row[0], row[1], row[3], str_time])
        i += 1
        # Stop right after the last allowed row rather than scanning on to
        # the next matching one.
        if i >= max_rows:
            break

三,数据分析

使用 Spark(Scala)对数据进行处理。注意:下面的程序读取的是 processed.csv,而上一步的预处理脚本写出的是 processed2.csv,运行前需要把两处文件名统一。

import org.apache.spark.sql.SparkSession
import org.apache.spark.{SparkConf, SparkContext}

import java.io.{File, PrintWriter}

/**
 * Spark job that reads the preprocessed behaviour CSV and writes four JSON
 * result files under D:\Json:
 *   result1.json - row count per action
 *   result2.json - the ten users with the most "buy" rows
 *   result3.json - the ten items with the most "buy" rows
 *   result4.json - number of "buy" rows per calendar day, sorted by day
 */
object Main {

  /** Writes `content` to the file at `path`, closing the writer even if the write throws. */
  private def writeResult(path: String, content: String): Unit = {
    val writer = new PrintWriter(new File(path))
    try writer.write(content)
    finally writer.close()
  }

  def main(args: Array[String]): Unit = {
    println("Hello world!")
    val sparkConf = new SparkConf().setMaster("local").setAppName("a")
    val sc = new SparkContext(sparkConf)
    // Stop the context even when one of the reports fails.
    try {
      val spark = SparkSession.builder().getOrCreate()
      val inputfile = "D:\\UserBehavior.csv\\processed.csv"
      import spark.implicits._

      // NOTE(review): `Info` is not defined in the visible source. The column
      // names used below imply a case class with fields
      // (userId, itemId, action, time) -- confirm where it is declared.
      val UserBehaviorDF = sc.textFile(inputfile)
        .map(_.split(","))
        .map(attributes => Info(attributes(0).trim.toInt, attributes(1).trim.toInt, attributes(2), attributes(3)))
        .toDF()

      // Row count per action. collectAsList() returns a java.util.List whose
      // toString is "[e1, e2, ...]"; since every element is a JSON object
      // string, the written text is a JSON array.
      val UserBehaviorCount = UserBehaviorDF.groupBy("action").count()
      writeResult("D:\\Json\\result1.json", UserBehaviorCount.toJSON.collectAsList().toString)

      // The "buy" rows feed the three remaining reports, so filter once and
      // cache instead of re-reading and re-filtering the input each time.
      val buyDF = UserBehaviorDF.filter(UserBehaviorDF("action") === "buy").cache()

      // Ten users with the most purchases.
      val userBehavior_top10 = buyDF.select("userId").rdd
        .map(v => (v(0).toString, 1))
        .reduceByKey(_ + _)
        .sortBy(_._2, false)
        .take(10)
      writeResult("D:\\Json\\result2.json",
        sc.parallelize(userBehavior_top10).toDF().toJSON.collectAsList().toString)

      // Ten best-selling items.
      val item_top10 = buyDF.select("itemId").rdd
        .map(v => (v(0).toString, 1))
        .reduceByKey(_ + _)
        .sortBy(_._2, false)
        .take(10)
      writeResult("D:\\Json\\result3.json",
        sc.parallelize(item_top10).toDF().toJSON.collectAsList().toString)

      // Purchases per day: the date is the part of `time` before the first
      // space, read from the column directly rather than from Row.toString.
      val buy_order_by_date = buyDF.select("time").rdd
        .map(v => (v.getString(0).split(" ")(0), 1))
        .reduceByKey(_ + _)
        .sortBy(_._1)
        .collect()
      writeResult("D:\\Json\\result4.json",
        sc.parallelize(buy_order_by_date).toDF().toJSON.collectAsList().toString)
    } finally {
      sc.stop()
    }
  }
}

四,数据可视化

使用 Flask 和 ECharts 进行数据可视化。下面先给出 Flask 应用代码,紧接着(从 <!DOCTYPE html> 开始)是它渲染的模板 demo.html。

from flask import Flask, render_template
import json
import jinja2

# Flask application object on which the route below is registered.
app = Flask(__name__)


def _load_result(path):
    """Return the parsed JSON content of the result file at *path*."""
    with open(path, 'r') as handle:
        return json.load(handle)


# The four result files are read once, at import time, and kept in module
# globals that the index view reads.
data = _load_result("D:\\Json\\result1.json")
data2 = _load_result("D:\\Json\\result2.json")
data3 = _load_result("D:\\Json\\result3.json")
data4 = _load_result("D:\\Json\\result4.json")
@app.route('/')
def index():
    """Render demo.html with the chart data built from the loaded results.

    Every chart receives a ``series_data*`` list of ``{'name', 'value'}``
    dicts; charts 2-4 additionally receive a ``categries_data*`` list holding
    only the names. The keyword names (including the ``categries`` spelling)
    are the ones the template reads.
    """
    def to_series(rows, name_key, value_key):
        # One {'name': ..., 'value': ...} entry per input row.
        return [{'name': row[name_key], 'value': row[value_key]} for row in rows]

    context = {'series_data1': to_series(data, 'action', 'count')}

    # data2..data4 share the same "_1"/"_2" row layout.
    for suffix, rows in (('2', data2), ('3', data3), ('4', data4)):
        context['series_data' + suffix] = to_series(rows, '_1', '_2')
        context['categries_data' + suffix] = [row["_1"] for row in rows]

    # Hand everything to the template.
    return render_template('demo.html', **context)

# Start the Flask server with debug=True, but only when this file is executed
# as a script (not when it is imported).
if __name__ == '__main__':
    app.run(debug=True)
<!DOCTYPE html>
<html>
 <head>
    <meta charset="utf-8" />
    <title>淘宝数据可视化</title>

    <!-- ECharts from jsDelivr; provides the global `echarts` used by the chart scripts in this page.
         NOTE(review): the exact version originally pinned is unknown; "echarts@5" resolves to the
         latest 5.x release. Pin an exact version if reproducibility matters. -->
    <script src="https://cdn.jsdelivr.net/npm/echarts@5/dist/echarts.min.js"></script>
      <script src="https://cdn.bootcdn.net/ajax/libs/jquery/3.5.1/jquery.min.js"></script>
  </head>
  <body>
        <h1 align="center">Spark淘宝数据分析可视化图表</h1>
        <div class="empty"></div>
        <div class="contain">
        <div id="box1" style="width: 1500px;height:400px"></div>
        <div id="box2" style="width: 1500px;height:600px"></div>
        <div id="box3" style="width: 1500px;height:400px"></div>
        <div id="box4" style="width: 1500px;height:600px"></div>
        </div>

        <script type="text/javascript">
          // Bar chart in #box1: the ten users with the most purchases.
          // The two template variables are passed in by the Flask view and
          // serialised to JSON by the tojson filter.
          var myChart1 = echarts.init(document.getElementById('box1'));

          var option = {
            title: { text: "购物量前十的用户", left: 'center' },
            tooltip: {},
            xAxis: {
              type: 'category',
              data: {{ categries_data2|tojson }}
            },
            yAxis: { type: "value" },
            series: [
              {
                type: 'bar',
                data: {{ series_data2|tojson }}
              }
            ]
          };

          myChart1.setOption(option);
        </script>

    <script type="text/javascript">
      // Pie chart in #box2: one slice per entry of series_data1
      // (name/value pairs passed in by the Flask view).
      var myChart2 = echarts.init(document.getElementById('box2'));

      // Declared with var, like the other chart scripts, instead of relying
      // on an implicit global created by a bare assignment.
      var option2 = {
        title: {
          text: "Action",
          left: 'center'
        },
        tooltip: {
          trigger: 'item'
        },
        legend: {
          orient: 'vertical',
          left: ""
        },
        series: [
          {
            name: 'Access From',
            type: 'pie',
            radius: '60%',
            data: {{ series_data1|tojson }},
            emphasis: {
              itemStyle: {
                shadowBlur: 10,
                shadowOffsetX: 0,
                shadowColor: 'rgba(0, 0, 0, 0.5)'
              }
            }
          }
        ]
      };

      myChart2.setOption(option2);
    </script>
  <script type="text/javascript">
    // Bar chart in #box3: the ten best-selling items.
    // The two template variables are passed in by the Flask view and
    // serialised to JSON by the tojson filter.
    var myChart3 = echarts.init(document.getElementById('box3'));

    var option3 = {
      title: { text: "商品销量前十", left: 'center' },
      tooltip: {},
      xAxis: {
        type: 'category',
        data: {{ categries_data3|tojson }}
      },
      yAxis: { type: "value" },
      series: [
        {
          type: 'bar',
          data: {{ series_data3|tojson }}
        }
      ]
    };

    myChart3.setOption(option3);
  </script>
  <script type="text/javascript">
    // Line chart in #box4: sales per day for the period named in the title.
    // The two template variables are passed in by the Flask view.
    var myChart4 = echarts.init(document.getElementById('box4'));

    // Declared with var, like the other chart scripts, instead of relying
    // on an implicit global created by a bare assignment.
    var option4 = {
      title: {
        text: "11月25日到12月3日平台销量统计",
        left: 'center'
      },
      tooltip: {
        trigger: 'item'
      },
      xAxis: {
        type: 'category',
        data: {{ categries_data4|tojson }}
      },
      yAxis: {
        type: 'value'
      },
      series: [
        {
          data: {{ series_data4|tojson }},
          type: 'line'
        }
      ]
    };

    myChart4.setOption(option4);
  </script>

  </body>
</html>

标签: 学习 spark

本文转载自: https://blog.csdn.net/L666223/article/details/137358766
版权归原作者 by远不止于此 所有, 如有侵权,请联系我们删除。

“基于Spark的淘宝数据分析学习案例”的评论:

还没有评论