wangtopia

王同学

Simple Python Scripts

Published in # Tech # python

Douban Books Top 250
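The script below walks the Douban Books Top 250 list (https://book.douban.com/top250) page by page, parses each entry with BeautifulSoup, and writes the results to an Excel file with pandas.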

import requests  # HTTP request library
from bs4 import BeautifulSoup  # HTML parsing library
import pandas as pd  # data export
import time
from fake_useragent import UserAgent


def get_douban_books(pages=10):
    books_data = []
    # Anti-scraping measure: pick a random User-Agent
    ua = UserAgent()

    headers = {
        'User-Agent': ua.random,  # randomly chosen User-Agent (picked once here, reused for every page)
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Language': 'zh-CN,zh;q=0.8',
        'Connection': 'keep-alive'
    }

    for page in range(pages):
        url = f'https://book.douban.com/top250?start={page * 25}'

        try:
            # Send the request
            response = requests.get(url, headers=headers)
            response.raise_for_status()  # raises an HTTPError for 4xx/5xx responses
            # Parse the HTML
            soup = BeautifulSoup(response.text, 'html.parser')
            books = soup.select('tr.item')  # CSS selector: every <tr> with class "item"

            for book in books:
                try:
                    title = book.select_one('div.pl2 a').get('title')  # select_one() returns the first match; read the <a> tag's title attribute
                    info = book.select_one('p.pl').text.strip()  # basic info text from the p.pl element
                    rating = book.select_one('span.rating_nums').text.strip()  # get the text content and strip surrounding whitespace

                    # Number of ratings
                    rating_people = book.select_one('span.pl').text
                    # Keep only the digits in the string:
                    # str.isdigit() tests whether a character is a digit
                    # filter() keeps only the characters for which the predicate is true
                    # ''.join() joins the remaining characters back into one string
                    rating_people = ''.join(filter(str.isdigit, rating_people))
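                    # For example (hypothetical value): ''.join(filter(str.isdigit, '(356782人评价)')) -> '356782'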
                    # Handle an element that may be missing
                    quote = book.select_one('span.inq')
                    quote = quote.text if quote else ''  # take the text if present, otherwise an empty string

                    # Store each book's info as a dict
                    books_data.append({
                        '书名': title,
                        '基本信息': info,
                        '评分': float(rating),  # string -> float
                        '评价人数': int(rating_people),  # string -> int
                        '一句话评价': quote
                    })

                except Exception as e:
                    print(f'Error parsing a book entry: {e}')
                    continue
            # Pause between pages to avoid sending requests too quickly
            time.sleep(2)

        except Exception as e:
            print(f'Error scraping page {page + 1}: {e}')
            continue

        print(f'Successfully scraped page {page + 1}')

    return books_data


def save_to_excel(books_data, filename='douban_books.xlsx'):
    # Convert the list of dicts into a DataFrame
    df = pd.DataFrame(books_data)
    # Save as an Excel file (the openpyxl engine must be installed)
    df.to_excel(filename, index=False, engine='openpyxl')
    print(f'Data saved to {filename}')


if __name__ == '__main__':
    print('Starting to scrape the Douban books Top 250...')
    books_data = get_douban_books(pages=10)
    save_to_excel(books_data)
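As a quick sanity check (a minimal sketch of my own, assuming the script above has already written douban_books.xlsx to the working directory), the saved file can be read back with pandas:

import pandas as pd

df = pd.read_excel('douban_books.xlsx', engine='openpyxl')
print(df.shape)   # expect up to (250, 5): 250 books, 5 columns
print(df.head())  # first few rows with the 书名 / 基本信息 / 评分 / 评价人数 / 一句话评价 columns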

Douban Movies Top 250
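The movie script below follows the same pattern against https://movie.douban.com/top250, adding a small clean_text() helper and a bit more parsing for the title, credits, and rating fields.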

import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
from fake_useragent import UserAgent


def clean_text(text):
    """清理文本数据,移除多余的空格和换行符"""
    if text:
        return ' '.join(text.strip().split())
    return ''
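# A quick illustration of clean_text (hypothetical input): clean_text('  1994 /  美国 \n 剧情 ') -> '1994 / 美国 剧情'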


def get_douban_movies(pages=10):
    movies_data = []
    ua = UserAgent()

    headers = {
        'User-Agent': ua.random,
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Language': 'zh-CN,zh;q=0.8',
        'Connection': 'keep-alive'
    }

    for page in range(pages):
        url = f'https://movie.douban.com/top250?start={page * 25}'

        try:
            response = requests.get(url, headers=headers)
            response.raise_for_status()

            soup = BeautifulSoup(response.text, 'html.parser')
            movies = soup.select('div.item')

            for movie in movies:
                try:
                    # Title
                    title = movie.select_one('.title').text.strip() if movie.select_one('.title') else '未知'

                    # Foreign-language / alternate title (if present)
                    foreign_title = movie.select_one('.other')
                    foreign_name = clean_text(foreign_title.text) if foreign_title else ''

                    # Detail line (director, cast, year, region, genre)
                    info_element = movie.select_one('div.bd p')
                    if info_element:
                        info_text = clean_text(info_element.text)
                        # Split the info line on '/'
                        info_parts = info_text.split('/')

                        # Director and cast (usually the first part)
                        director_actor = info_parts[0] if info_parts else ''

                        # Year, region, and genre (usually the remaining parts)
                        year_region_type = ' / '.join(info_parts[1:]) if len(info_parts) > 1 else ''
                    else:
                        director_actor = ''
                        year_region_type = ''

                    # Rating
                    rating = movie.select_one('.rating_num')
                    rating = float(rating.text.strip()) if rating else 0.0

                    # Number of ratings
                    rating_people = movie.select_one('.star span:last-child')
                    if rating_people:
                        rating_people = ''.join(filter(str.isdigit, rating_people.text))
                        rating_people = int(rating_people) if rating_people else 0
                    else:
                        rating_people = 0

                    # One-line quote
                    quote = movie.select_one('.quote .inq')
                    quote = quote.text.strip() if quote else ''

                    movies_data.append({
                        '中文名': title,
                        '外文名': foreign_name,
                        '导演演员': director_actor,
                        '年份地区类型': year_region_type,
                        '评分': rating,
                        '评价人数': rating_people,
                        '一句话评价': quote
                    })

                except Exception as e:
                    print(f'Error parsing a movie entry: {e}')
                    continue

            print(f'Successfully scraped page {page + 1}')
            time.sleep(2)

        except Exception as e:
            print(f'Error scraping page {page + 1}: {e}')
            continue

    return movies_data


def save_to_excel(movies_data, filename='douban_movies.xlsx'):
    df = pd.DataFrame(movies_data)
    df.to_excel(filename, index=False, engine='openpyxl')
    print(f'Data saved to {filename}')


def main():
    print('Starting to scrape the Douban movies Top 250...')
    movies_data = get_douban_movies(pages=10)

    if movies_data:
        # Sort by rating, highest first
        sorted_movies = sorted(movies_data, key=lambda x: x['评分'], reverse=True)
        save_to_excel(sorted_movies, 'douban_movies_top250.xlsx')
    else:
        print('No data retrieved')


if __name__ == '__main__':
    main()
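
Both scripts give up on a page after a single failed request. As a possible extension, here is a minimal sketch of a retry helper (my own illustration, not part of the original scripts; the name fetch_with_retries and its parameters are made up) that could stand in for the bare requests.get() calls:

import time
import requests


def fetch_with_retries(url, headers, retries=3, delay=5):
    """Try a GET request up to `retries` times, sleeping `delay` seconds between attempts."""
    for attempt in range(1, retries + 1):
        try:
            response = requests.get(url, headers=headers, timeout=10)
            response.raise_for_status()
            return response
        except requests.RequestException as e:
            print(f'Attempt {attempt}/{retries} for {url} failed: {e}')
            if attempt < retries:
                time.sleep(delay)
    return None  # caller decides how to handle a page that never loaded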