0


【开源】2024最新python豆瓣电影数据爬虫+可视化分析项目

项目介绍

【开源】本项目基于 python + pandas + flask + mysql 等技术,实现豆瓣电影数据的获取及可视化分析展示。觉得有用的朋友可以来个一键三连,感谢!

项目演示

【开源】2024最新python豆瓣电影数据爬虫+可视化分析项目

项目截图

  • 首页(截图未能随文转载)
  • 列表页(截图未能随文转载)
  • 爬虫演示(截图未能随文转载)

项目地址

https://github.com/mudfish/python-douban-view

项目结构

在这里插入图片描述

核心模块

电影爬虫

"""
Asynchronous, concurrent Douban movie crawler.

For every movie type in ``MOVIE_TYPES`` the crawler pages through the Douban
recommendation API, fetches each movie's detail page, appends one row per
movie to ``CSV_NAME`` and finally loads the de-duplicated rows into the
``tb_movie`` MySQL table.  The next page to fetch for each type is persisted
in ``PAGE_PROGRESS_FILE`` so that a later run can resume.
"""
import asyncio
import csv
import json
import os
import random
import re
import time
import traceback

import aiohttp
import pandas as pd
from lxml import etree
from sqlalchemy import create_engine, text
from tqdm import tqdm

# Highest page number fetched per movie type
MAX_PAGES = 5
# Progress file: maps a movie type to the next page that has to be fetched
PAGE_PROGRESS_FILE = "page_progress.json"
# Movie types (Douban tags) to crawl
MOVIE_TYPES = ["剧情", "喜剧", "动作", "爱情", "科幻", "动画"]
# CSV file name
CSV_NAME = "movie_data.csv"
# CSV header row
CSV_HEADS = [
    "id",
    "movie_id",
    "title",
    "year",
    "directors",
    "casts",
    "rating",
    "cover",
    "country",
    "summary",
    "types",
    "lang",
    "release_date",
    "time",
    "url",
]
# Release-date clean-up pattern: removes everything except digits and "-"
RELEASE_DATE_REMOVE_RE = r"[^0-9-]"

# NOTE(review): the password/host part of the URL in the published listing was
# destroyed by the hosting page's e-mail obfuscation.  Set DOUBAN_DB_URL, or
# replace the placeholder below with the real credentials.
engine = create_engine(
    os.environ.get(
        "DOUBAN_DB_URL",
        "mysql+pymysql://root:password@127.0.0.1:3306/db_douban",
    )
)


def get_id():
    """Return a pseudo-random numeric string used as the row id.

    The value is a random integer in [1, 100000000] followed by the
    fractional digits of the current timestamp.
    """
    return str(random.randint(1, 100000000)) + str(time.time()).split(".")[1].strip()


def _first(nodes, default=""):
    """Return the first XPath result, or ``default`` when nothing matched."""
    return nodes[0] if nodes else default


class Spider:
    """Crawls Douban movie lists and detail pages into a CSV file and MySQL."""

    def __init__(self):
        self.movie_page_url = "https://m.douban.com/rexxar/api/v2/movie/recommend?"
        self.movie_detail_url = "https://movie.douban.com/subject/{}/"
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36",
            "Referer": "https://movie.douban.com/explore",
        }
        self.movie_types = MOVIE_TYPES
        # movie type -> next page to fetch
        self.page_progress = {}
        # Number of pages this run still has to fetch
        self.total_pages = 0
        self.completed_pages = 0
        self.global_progress_bar = None

    def init(self):
        """Start every run with a fresh CSV file that contains only the header."""
        # Mode "w" truncates an existing file, so no explicit delete is needed.
        with open(CSV_NAME, "w", newline="", encoding="utf-8") as writer_f:
            csv.writer(writer_f).writerow(CSV_HEADS)

    def load_page_progress(self):
        """Load the saved per-type progress, if a progress file exists."""
        if not os.path.exists(PAGE_PROGRESS_FILE):
            return
        # Check the size before opening: an empty file is re-initialised, and
        # that rewrite must not happen while a read handle is still open.
        if os.stat(PAGE_PROGRESS_FILE).st_size == 0:
            print("初始化页面进度")
            self.page_progress = {}
            self.save_page_progress()
            return
        with open(PAGE_PROGRESS_FILE, "r", encoding="utf-8") as f:
            self.page_progress = json.load(f)

    def save_page_progress(self):
        """Write the current per-type progress to the progress file."""
        with open(PAGE_PROGRESS_FILE, "w", encoding="utf-8") as f:
            json.dump(self.page_progress, f, ensure_ascii=False)

    async def get_movie_pages(self, session, type_name):
        """Fetch all pending list pages of one movie type.

        Args:
            session: the shared ``aiohttp.ClientSession``.
            type_name: the movie type (tag) to crawl.

        A page that fails is reported and skipped; progress is recorded only
        for pages that were processed completely.
        """
        start_page = self.page_progress.get(type_name, 1)
        for page in range(start_page, MAX_PAGES + 1):
            # NOTE(review): "start" advances by 20 per page while "count" is
            # 10, so every second batch of 10 results is never requested.
            # Kept as published - confirm whether that is intended.
            params = {"start": (page - 1) * 20, "count": 10, "tags": type_name}
            try:
                async with session.get(
                    self.movie_page_url, headers=self.headers, params=params
                ) as resp:
                    resp.raise_for_status()
                    resp_json = await resp.json()
                # The list response is released before the detail pages are
                # fetched, so it does not occupy a connection meanwhile.
                for movie in resp_json["items"]:
                    if movie["type"] == "movie":
                        await self.process_movie(session, movie)
                # Record progress
                self.page_progress[type_name] = page + 1
                self.save_page_progress()
                # Refresh the global progress bar
                self.update_global_progress()
            except Exception as e:
                # NOTE(review): a failed page is skipped; once a later page
                # succeeds the saved progress moves past it, so it is not
                # retried on the next run.
                print(f"处理:{type_name}第{page}页失败: {e}")
                traceback.print_exc()
                continue

    async def process_movie(self, session, movie):
        """Fetch one movie's detail page and append its row to the CSV file.

        Args:
            session: the shared ``aiohttp.ClientSession``.
            movie: one item of the list API response; its ``id``, ``title``
                and ``year`` keys are read.

        Fields that are missing on the detail page are stored as empty
        strings, the same way a missing runtime is handled.
        """
        detail_url = self.movie_detail_url.format(movie["id"])
        async with session.get(detail_url, headers=self.headers) as resp:
            resp.raise_for_status()
            html_text = await resp.text()
        path = etree.HTML(html_text)

        release_date = _first(
            path.xpath('//span[@property="v:initialReleaseDate"]/text()')
        )[:10]

        movie_data = [
            get_id(),
            movie["id"],
            movie["title"],
            movie["year"],
            # Directors
            ",".join(path.xpath('//a[@rel="v:directedBy"]/text()')),
            # Leading cast
            ",".join(path.xpath('//a[@rel="v:starring"]/text()')),
            # Rating
            _first(path.xpath('//strong[@property="v:average"]/text()')),
            # Cover image
            _first(path.xpath('//img[@rel="v:image"]/@src')),
            # Country
            _first(
                path.xpath(
                    '//span[contains(text(),"制片国家")]/following-sibling::br[1]/preceding-sibling::text()[1]'
                )
            ).replace(" / ", ","),
            # Summary
            _first(path.xpath('//span[@property="v:summary"]/text()')).strip(),
            # Genres
            ",".join(path.xpath('//div[@id="info"]/span[@property="v:genre"]/text()')),
            # Language
            _first(
                path.xpath(
                    '//span[contains(text(),"语言")]/following-sibling::br[1]/preceding-sibling::text()[1]'
                )
            ),
            # Release date, reduced to digits and "-"
            re.sub(RELEASE_DATE_REMOVE_RE, "", release_date),
            # Runtime (may be absent)
            _first(path.xpath('//span[@property="v:runtime"]/text()')),
            # Detail page URL
            detail_url,
        ]
        self.save_to_csv(movie_data)

    def save_to_csv(self, row):
        """Append one row to the CSV file."""
        with open(CSV_NAME, "a", newline="", encoding="utf-8") as f:
            csv.writer(f).writerow(row)

    def clean_csv(self):
        """De-duplicate the CSV rows, append them to ``tb_movie`` and remove
        duplicate ``movie_id`` rows from the table (the smallest id is kept).
        """
        print("===========清理数据============")
        df = pd.read_csv(CSV_NAME, encoding="utf-8")
        df.drop_duplicates(subset=["movie_id"], keep="first", inplace=True)
        print("存储到数据库...")
        df.to_sql("tb_movie", con=engine, index=False, if_exists="append")
        print("清理重复数据...")
        # engine.begin() commits on success and always returns the connection
        # to the pool; a bare engine.connect().execute() does neither.
        with engine.begin() as conn:
            conn.execute(
                text(
                    "delete t1 from tb_movie t1 inner join (select min(id) as id,movie_id from tb_movie group by movie_id having count(*) > 1) t2 on t1.movie_id=t2.movie_id where t1.id>t2.id"
                )
            )

    def update_global_progress(self):
        """Advance the global progress bar by one completed page."""
        self.completed_pages += 1
        self.global_progress_bar.update(1)
        self.global_progress_bar.refresh()

    async def run(self):
        """Run one crawl: reset the CSV, resume progress, crawl all types
        concurrently and store the result in the database.
        """
        self.init()
        self.load_page_progress()

        # Count the pages still pending.  The condition mirrors the loop range
        # in get_movie_pages (next_page..MAX_PAGES inclusive), so a type whose
        # next page equals MAX_PAGES still contributes its one remaining page.
        self.total_pages = 0
        for type_name in self.movie_types:
            next_page = self.page_progress.get(type_name, 1)
            if next_page <= MAX_PAGES:
                self.total_pages += MAX_PAGES + 1 - next_page
        print(self.total_pages)

        if self.total_pages > 0:
            self.global_progress_bar = tqdm(
                total=self.total_pages, desc="progress", unit="page", colour="GREEN"
            )
            try:
                async with aiohttp.ClientSession() as session:
                    tasks = [
                        self.get_movie_pages(session, type_name)
                        for type_name in self.movie_types
                    ]
                    await asyncio.gather(*tasks)
            finally:
                # Close the bar even when the crawl is interrupted
                self.global_progress_bar.close()
            self.clean_csv()


if __name__ == "__main__":
    asyncio.run(Spider().run())

电影可视化

接口代码

"""Flask application: login/registration plus the movie dashboard and list pages."""
import os

from flask import Flask, render_template, request, redirect, url_for, session
from utils import db_query

app = Flask(__name__)
# The session signing key can be overridden through the environment; the
# published default is kept as the fallback.
app.secret_key = os.environ.get("FLASK_SECRET_KEY", "mysessionkey")

# Paths that can be reached without being logged in
PUBLIC_PATHS = ("/login", "/logout", "/register")


# Global request interceptor
@app.before_request
def before_request():
    """Redirect every non-public request to the login page when no user is logged in."""
    if request.path.startswith("/static") or request.path in PUBLIC_PATHS:
        return None
    if not session.get("login_username"):
        return redirect(url_for("login"))
    return None


# Home page
@app.route("/")
def index():
    """Render the dashboard with the overall, per-type and per-rating statistics."""
    # Overall movie statistics
    movie_stats = db_query.fetch_movie_statistics()
    # Movie count per type
    movie_type_distribution = db_query.fetch_movie_type_distribution()
    # Movie count per rating
    movie_rating_distribution = db_query.fetch_movie_rating_distribution()
    return render_template(
        "index.html",
        login_username=session.get("login_username"),
        movie_stats=movie_stats,
        movie_type_distribution=movie_type_distribution,
        movie_rating_distribution=movie_rating_distribution,
    )


# Login
@app.route("/login", methods=["GET", "POST"])
def login():
    """Show the login form; on POST check the credentials and start a session."""
    if request.method == "POST":
        # .get() keeps a request with missing fields on the normal
        # "wrong credentials" path instead of failing with a KeyError.
        username = request.form.get("username", "")
        password = request.form.get("password", "")
        # NOTE(review): passwords are stored and compared in plain text;
        # consider storing a salted hash instead.
        sql = "SELECT * FROM `tb_user` WHERE `username` = %s AND `password` = %s"
        if len(db_query.query(sql, (username, password))) > 0:
            # Store the user in the session
            session["login_username"] = username
            return redirect(url_for("index"))
        return render_template(
            "error.html",
            error="用户名或密码错误",
        )
    # GET, and the HEAD requests Flask accepts automatically for GET routes
    return render_template("login.html")


# Logout
@app.route("/logout")
def logout():
    """End the session and go back to the home page."""
    session.pop("login_username", None)
    return redirect(url_for("index"))


# Registration
@app.route("/register", methods=["GET", "POST"])
def register():
    """Show the registration form; on POST create the user if the name is free."""
    if request.method == "POST":
        username = request.form.get("username", "")
        password = request.form.get("password", "")
        password_confirm = request.form.get("password_confirm", "")

        if not username or not password:
            return render_template(
                "error.html",
                error="用户名和密码不能为空",
            )
        if password != password_confirm:
            return render_template(
                "error.html",
                error="两次密码输入不一致",
            )

        # Reject a user name that already exists
        sql = "SELECT * FROM `tb_user` WHERE `username` = %s"
        if len(db_query.query(sql, (username,))) > 0:
            return render_template(
                "error.html",
                error="用户名已存在",
            )

        sql = "INSERT INTO `tb_user` (`username`, `password`) VALUES (%s, %s)"
        db_query.query(sql, (username, password), db_query.QueryType.NO_SELECT)
        return redirect(url_for("login"))
    # GET, and the HEAD requests Flask accepts automatically for GET routes
    return render_template("register.html")


@app.route("/list")
def movie_list():
    """Render the movie list page."""
    # Movie rows from the database, passed to list.html as "movies"
    movies = db_query.fetch_movie_list()
    return render_template(
        "list.html", login_username=session.get("login_username"), movies=movies
    )


@app.errorhandler(404)
def page_not_found(error):
    """Render the custom 404 page."""
    return render_template("404.html"), 404


@app.errorhandler(500)
def system_error(error):
    """Render the custom 500 page."""
    return render_template("500.html"), 500


if __name__ == "__main__":
    # Reload templates automatically when they change
    app.jinja_env.auto_reload = True
    app.run(host="127.0.0.1", port=8002, debug=True)

首页

<!DOCTYPE html>
<!-- Dashboard template: four statistic cards plus two ECharts charts.
     Context variables used: login_username, movie_stats,
     movie_type_distribution, movie_rating_distribution. -->
<html lang="en">
  <head>
    <meta charset="utf-8" />
    <meta http-equiv="X-UA-Compatible" content="IE=edge" />
    <meta name="viewport" content="width=device-width, initial-scale=1, shrink-to-fit=no" />
    <meta name="description" content="" />
    <meta name="author" content="" />
    <title>首页</title>
    <!-- Custom fonts for this template-->
    <link href="/static/vendor/fontawesome-free/css/all.min.css" rel="stylesheet" type="text/css" />
    <link href="https://fonts.googleapis.com/css?family=Nunito:200,200i,300,300i,400,400i,600,600i,700,700i,800,800i,900,900i" rel="stylesheet" />
    <!-- Custom styles for this template-->
    <link href="/static/css/sb-admin-2.min.css" rel="stylesheet" />
  </head>
  <body id="page-top">
    <!-- Page Wrapper -->
    <div id="wrapper">
      <!-- Sidebar -->
      <ul class="navbar-nav bg-gradient-primary sidebar sidebar-dark accordion" id="accordionSidebar">
        <!-- Sidebar - Brand (links to the home route; "index.html" is not a route of the app) -->
        <a class="sidebar-brand d-flex align-items-center justify-content-center" href="/">
          <div class="sidebar-brand-icon rotate-n-15">
            <i class="fas fa-laugh-wink"></i>
          </div>
          <div class="sidebar-brand-text mx-3">豆瓣电影可视化</div>
        </a>
        <!-- Divider -->
        <hr class="sidebar-divider my-0" />
        <!-- Nav Item - Dashboard -->
        <li class="nav-item active">
          <a class="nav-link" href="/">
            <i class="fas fa-fw fa-tachometer-alt"></i>
            <span>首页</span>
          </a>
        </li>
        <!-- Nav Item - Movie list -->
        <li class="nav-item">
          <a class="nav-link" href="/list">
            <i class="fas fa-fw fa-table"></i>
            <span>电影列表</span>
          </a>
        </li>
        <!-- Divider -->
        <hr class="sidebar-divider d-none d-md-block" />
        <!-- Sidebar Toggler (Sidebar) -->
        <div class="text-center d-none d-md-inline">
          <button class="rounded-circle border-0" id="sidebarToggle"></button>
        </div>
      </ul>
      <!-- End of Sidebar -->

      <!-- Content Wrapper -->
      <div id="content-wrapper" class="d-flex flex-column">
        <!-- Main Content -->
        <div id="content">
          <!-- Topbar -->
          <nav class="navbar navbar-expand navbar-light bg-white topbar mb-4 static-top shadow">
            <!-- Sidebar Toggle (Topbar) -->
            <button id="sidebarToggleTop" class="btn btn-link d-md-none rounded-circle mr-3">
              <i class="fa fa-bars"></i>
            </button>
            <!-- Topbar Search -->
            <!-- <form class="d-none d-sm-inline-block form-inline mr-auto ml-md-3 my-2 my-md-0 mw-100 navbar-search">
            <div class="input-group">
              <input type="text" class="form-control bg-light border-0 small" placeholder="Search for..." aria-label="Search" aria-describedby="basic-addon2">
              <div class="input-group-append">
                <button class="btn btn-primary" type="button">
                  <i class="fas fa-search fa-sm"></i>
                </button>
              </div>
            </div>
          </form> -->
            <!-- Topbar Navbar -->
            <ul class="navbar-nav ml-auto">
              <div class="topbar-divider d-none d-sm-block"></div>
              <!-- Nav Item - User Information -->
              <li class="nav-item dropdown no-arrow">
                <a class="nav-link dropdown-toggle" href="#" id="userDropdown" role="button" data-toggle="dropdown" aria-haspopup="true" aria-expanded="false">
                  <span class="mr-2 d-none d-lg-inline text-gray-600 small">{{login_username}}</span>
                  <img class="img-profile rounded-circle" src="/static/img/avatar.png" />
                </a>
                <!-- Dropdown - User Information -->
                <div class="dropdown-menu dropdown-menu-right shadow animated--grow-in" aria-labelledby="userDropdown">
                  <a class="dropdown-item" href="#" data-toggle="modal" data-target="#logoutModal">
                    <i class="fas fa-sign-out-alt fa-sm fa-fw mr-2 text-gray-400"></i>
                    Logout
                  </a>
                </div>
              </li>
            </ul>
          </nav>
          <!-- End of Topbar -->

          <!-- Begin Page Content -->
          <div class="container-fluid">
            <!-- Page Heading -->
            <!-- <div class="d-sm-flex align-items-center justify-content-between mb-4">
            <h1 class="h3 mb-0 text-gray-800">Dashboard</h1>
            <a href="#" class="d-none d-sm-inline-block btn btn-sm btn-primary shadow-sm"><i class="fas fa-download fa-sm text-white-50"></i> Generate Report</a>
          </div> -->

            <!-- Content Row: statistic cards -->
            <div class="row">
              <!-- Card: total number of movies -->
              <div class="col-xl-3 col-md-6 mb-4">
                <div class="card border-left-primary shadow h-100 py-2">
                  <div class="card-body">
                    <div class="row no-gutters align-items-center">
                      <div class="col mr-2">
                        <div class="font-weight-bold text-primary text-uppercase mb-1">
                          电影总数
                        </div>
                        <div class="h5 mb-0 font-weight-bold text-gray-800">
                          {{ movie_stats['total_movies'] }}
                        </div>
                      </div>
                      <div class="col-auto">
                        <i class="fas fa-calendar fa-2x text-gray-300"></i>
                      </div>
                    </div>
                  </div>
                </div>
              </div>

              <!-- Card: highest rating -->
              <div class="col-xl-3 col-md-6 mb-4">
                <div class="card border-left-success shadow h-100 py-2">
                  <div class="card-body">
                    <div class="row no-gutters align-items-center">
                      <div class="col mr-2">
                        <div class="font-weight-bold text-success text-uppercase mb-1">
                          电影最高评分
                        </div>
                        <div class="h5 mb-0 font-weight-bold text-gray-800">
                          {{ movie_stats['highest_rating'] }}
                        </div>
                      </div>
                      <div class="col-auto">
                        <i class="fas fa-dollar-sign fa-2x text-gray-300"></i>
                      </div>
                    </div>
                  </div>
                </div>
              </div>

              <!-- Card: actor with the most appearances -->
              <div class="col-xl-3 col-md-6 mb-4">
                <div class="card border-left-info shadow h-100 py-2">
                  <div class="card-body">
                    <div class="row no-gutters align-items-center">
                      <div class="col mr-2">
                        <div class="font-weight-bold text-info text-uppercase mb-1">
                          出演最多演员
                        </div>
                        <div class="row no-gutters align-items-center">
                          <div class="col-auto">
                            <div class="h5 mb-0 mr-3 font-weight-bold text-gray-800">
                              {{ movie_stats['most_popular_cast'] }}
                            </div>
                          </div>
                          <div class="col">
                            <div class="progress progress-sm mr-2">
                              <div class="progress-bar bg-info" role="progressbar" style="width: 50%" aria-valuenow="50" aria-valuemin="0" aria-valuemax="100"></div>
                            </div>
                          </div>
                        </div>
                      </div>
                      <div class="col-auto">
                        <i class="fas fa-clipboard-list fa-2x text-gray-300"></i>
                      </div>
                    </div>
                  </div>
                </div>
              </div>

              <!-- Card: most frequent production country -->
              <div class="col-xl-3 col-md-6 mb-4">
                <div class="card border-left-warning shadow h-100 py-2">
                  <div class="card-body">
                    <div class="row no-gutters align-items-center">
                      <div class="col mr-2">
                        <div class="font-weight-bold text-warning text-uppercase mb-1">
                          制片最多国家
                        </div>
                        <div class="h5 mb-0 font-weight-bold text-gray-800">
                          {{ movie_stats['most_common_country'] }}
                        </div>
                      </div>
                      <div class="col-auto">
                        <i class="fas fa-comments fa-2x text-gray-300"></i>
                      </div>
                    </div>
                  </div>
                </div>
              </div>
            </div>

            <!-- Content Row: charts -->
            <div class="row">
              <!-- Pie chart: movies per type -->
              <div class="col-xl-6 col-lg-6">
                <div class="card shadow mb-4">
                  <!-- Card Header -->
                  <div class="card-header py-3 d-flex flex-row align-items-center justify-content-between">
                    <h6 class="m-0 font-weight-bold text-primary">
                      电影分类统计
                    </h6>
                  </div>
                  <!-- Card Body -->
                  <div class="card-body">
                    <div id="movie_type_chart" style="width: 100%;height: 450px"></div>
                    <!-- <div class="chart-area">
                      
                    </div> -->
                  </div>
                </div>
              </div>

              <!-- Line chart: movies per rating -->
              <div class="col-xl-6 col-lg-6">
                <div class="card shadow mb-4">
                  <!-- Card Header - Dropdown -->
                  <div class="card-header py-3 d-flex flex-row align-items-center justify-content-between">
                    <h6 class="m-0 font-weight-bold text-primary">
                      电影评分统计
                    </h6>
                    <div class="dropdown no-arrow">
                      <a class="dropdown-toggle" href="#" role="button" id="dropdownMenuLink" data-toggle="dropdown" aria-haspopup="true" aria-expanded="false">
                        <i class="fas fa-ellipsis-v fa-sm fa-fw text-gray-400"></i>
                      </a>
                      <div class="dropdown-menu dropdown-menu-right shadow animated--fade-in" aria-labelledby="dropdownMenuLink">
                        <div class="dropdown-header">Dropdown Header:</div>
                        <a class="dropdown-item" href="#">Action</a>
                        <a class="dropdown-item" href="#">Another action</a>
                        <div class="dropdown-divider"></div>
                        <a class="dropdown-item" href="#">Something else here</a>
                      </div>
                    </div>
                  </div>
                  <!-- Card Body -->
                  <div class="card-body">
                    <div id="movie_score_chart" style="width: 100%;height: 450px"></div>
                  </div>
                </div>
              </div>
            </div>
            <!-- Content Row -->
          </div>
          <!-- /.container-fluid -->
        </div>
        <!-- End of Main Content -->

        <!-- Footer -->
        <footer class="sticky-footer bg-white">
          <div class="container my-auto">
            <div class="copyright text-center my-auto">
              <span>@Laoxu Open Source.<a target="_blank" href="https://github.com/mudfish">Github</a></span>
            </div>
          </div>
        </footer>
        <!-- End of Footer -->
      </div>
      <!-- End of Content Wrapper -->
    </div>
    <!-- End of Page Wrapper -->

    <!-- Scroll to Top Button-->
    <a class="scroll-to-top rounded" href="#page-top">
      <i class="fas fa-angle-up"></i>
    </a>

    <!-- Logout Modal-->
    <div class="modal fade" id="logoutModal" tabindex="-1" role="dialog" aria-labelledby="exampleModalLabel" aria-hidden="true">
      <div class="modal-dialog" role="document">
        <div class="modal-content">
          <div class="modal-header">
            <h5 class="modal-title" id="exampleModalLabel">Ready to Leave?</h5>
            <button class="close" type="button" data-dismiss="modal" aria-label="Close">
              <span aria-hidden="true">×</span>
            </button>
          </div>
          <!-- <div class="modal-body">Select "Logout" below if you are ready to end your current session.</div> -->
          <div class="modal-footer">
            <button class="btn btn-secondary" type="button" data-dismiss="modal">
              Cancel
            </button>
            <a class="btn btn-primary" href="/logout">Logout</a>
          </div>
        </div>
      </div>
    </div>

    <!-- Bootstrap core JavaScript-->
    <script src="/static/vendor/jquery/jquery.min.js"></script>
    <script src="/static/vendor/bootstrap/js/bootstrap.bundle.min.js"></script>
    <!-- Core plugin JavaScript-->
    <script src="/static/vendor/jquery-easing/jquery.easing.min.js"></script>
    <!-- Custom scripts for all pages-->
    <script src="/static/js/sb-admin-2.min.js"></script>
    <!-- Page level plugins -->
    <script src="/static/vendor/chart.js/Chart.min.js"></script>
    <!-- Page level custom scripts -->
    <script src="/static/js/demo/chart-area-demo.js"></script>
    <script src="/static/js/demo/chart-pie-demo.js"></script>
    <script src="/static/js/echarts.min.js"></script>

    <!-- Pie chart: movie type distribution -->
    <script>
      // Wrapped in a function so the two chart scripts do not share globals.
      (function () {
        var chartDom = document.getElementById("movie_type_chart");
        var myChart = echarts.init(chartDom);
        var movieTypeData = {{ movie_type_distribution|tojson }};

        var option = {
          title: {
            text: "",
            subtext: "来源:豆瓣数据",
            left: "center",
          },
          tooltip: {
            trigger: "item",
          },
          legend: {
            orient: "vertical",
            left: "left",
          },
          series: [
            {
              name: "Access From",
              type: "pie",
              radius: "50%",
              data: movieTypeData,
              emphasis: {
                itemStyle: {
                  shadowBlur: 10,
                  shadowOffsetX: 0,
                  shadowColor: "rgba(0, 0, 0, 0.5)",
                },
              },
            },
          ],
        };

        option && myChart.setOption(option);
      })();
    </script>

    <!-- Line chart: movie rating distribution -->
    <script>
      (function () {
        var chartDom = document.getElementById("movie_score_chart");
        var myChart = echarts.init(chartDom);
        var ratingData = {{ movie_rating_distribution|tojson }};
        console.log(ratingData)

        var option = {
          title: {
            text: "",
            subtext: "来源:豆瓣数据",
            left: "center",
          },
          xAxis: {
            type: "category",
            boundaryGap: false,
            // The first element of each entry is used as the category label
            data: ratingData.map(item => item[0]),
          },
          yAxis: {
            type: "value",
          },
          series: [
            {
              // The second element of each entry is used as the value
              data: ratingData.map(item => item[1]),
              type: "line",
              areaStyle: {},
            },
          ],
          tooltip: {
            // Axis trigger: used for charts with a category axis (bar, line)
            trigger: 'axis',
            // Axis pointer, only effective with the axis trigger
            axisPointer: {
              // Default is a line; possible values: 'line' | 'shadow'
              type: 'shadow'
            }
          },
        };

        option && myChart.setOption(option);
      })();
    </script>
  </body>
</html>
标签: 开源 python 爬虫

本文转载自: https://blog.csdn.net/IndexMan/article/details/139044200
版权归原作者 罗汉爷 所有, 如有侵权,请联系我们删除。

“【开源】2024最新python豆瓣电影数据爬虫+可视化分析项目”的评论:

还没有评论