python采集腾讯新闻某个用户的发表数据列表以及详情页数据到数据库-编程爱好者之家

python采集腾讯新闻某个用户的发表数据列表以及详情页数据到数据库

2024-07-17 13:20:44 22
编程爱好者之家为大家带来python采集腾讯新闻某个用户的发表数据列表以及详情页数据到数据库
一：首先获取腾讯新闻你要采集的某个用户的列表所在数据的网页地址。

按照图示按 F12 打开开发者工具，切换到 Network 面板，找到名为 getSubNewsMixedList 的请求，它的响应里就是整个列表的数据
（PS:直接浏览器里面的页面地址是采集不到列表页数据的）
getSubNewsMixedList里面数据如图所示
了解了数据结构就可以继续下面的操作了
二：具体代码如下

注意：查看详情页的网页源代码可以发现，正文数据都放在 window.DATA 这个 JS 变量里，直接解析页面的 HTML 标签是获取不到正文的。
import requests
import os
from bs4 import BeautifulSoup
from urllib.parse import urljoin
from urllib.request import urlretrieve
import datetime
import pymysql
import random
import re
import  string
from pymysql import Connection
import time
import json
import os
import datetime


def create_folder(path):
    """Ensure the dated sub-folder ``<path>/<YYYY>/<YYYYMMDD>`` exists.

    Args:
        path: Base directory (string) under which the dated folder lives.

    Returns:
        A two-item list ``[foldername, folder]`` where ``foldername`` is
        ``path + "/" + folder`` and ``folder`` is the relative
        ``"YYYY/YYYYMMDD"`` part for today's local date.

    Raises:
        OSError: If the directory cannot be created.
    """
    # Read the clock once so the year and day components can never disagree
    # (two separate reads could straddle a New Year midnight).
    now = datetime.datetime.now()
    folder = now.strftime('%Y') + "/" + now.strftime('%Y%m%d')
    foldername = path + "/" + folder

    # exist_ok=True replaces the racy "check os.path.exists, then makedirs"
    # sequence: a concurrent creator no longer causes FileExistsError.
    os.makedirs(foldername, exist_ok=True)
    return [foldername, folder]



def get_random_next_hour_timestamp():
    """Return an integer Unix timestamp at a random moment in the coming hour.

    A random offset of 0-59 minutes plus 0-59 seconds is added to the current
    time, so the result lies between "now" and "now + 59 min 59 s".

    Returns:
        int: The current time plus the random offset, truncated to seconds.
    """
    current = time.time()

    # Two independent draws: minutes first, then seconds.
    extra_minutes = random.randint(0, 59)
    extra_seconds = random.randint(0, 59)
    offset_in_seconds = extra_minutes * 60 + extra_seconds

    return int(current + offset_in_seconds)


def download_images(image_list, output_folder, domain):
    """Download every image to a dated local folder and return the new URLs.

    Args:
        image_list: Iterable of remote image URLs.
        output_folder: Base directory handed to ``create_folder``; files are
            written into its dated sub-folder.
        domain: URL prefix used to build the public address of each saved
            file (``<domain>/<YYYY>/<YYYYMMDD>/<file>``).

    Returns:
        A list with exactly one entry per input URL, in the same order: the
        public URL of the saved copy, or the default placeholder image URL
        when that particular download or write failed. Callers pair the
        result with ``image_list`` by position, so the two lists must stay
        the same length.
    """
    fallback_url = 'https://www.codelovers.cn/default.jpg'
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3",
        "Referer": "https://www.baidu.com"
    }

    imgarr = []
    newfolder = create_folder(output_folder)

    for index, image_url in enumerate(image_list):
        try:
            # A timeout keeps one stalled server from hanging the whole run.
            response = requests.get(image_url, headers=headers, timeout=30)
            response.raise_for_status()
        except requests.exceptions.RequestException as e:
            print(f"Error downloading image at index {index}: {str(e)}")
            # Still record a placeholder so positions stay aligned with
            # image_list.
            imgarr.append(fallback_url)
            continue

        # File name: timestamp (to the second) plus a random number; the
        # extension is always ".jpg" regardless of the real image type.
        day = datetime.datetime.now().strftime('%Y%m%d%H%M%S')
        random_number = random.randint(1000, 100000)
        file_name = str(day) + str(random_number) + '.jpg'
        file_path = os.path.join(newfolder[0], file_name)

        try:
            with open(file_path, 'wb') as file:
                file.write(response.content)
        except OSError as e:
            print(f"Error saving image at index {index}: {str(e)}")
            imgarr.append(fallback_url)
            continue

        imgarr.append(str(domain) + "/" + str(newfolder[1]) + "/" + file_name)

    return imgarr

def getList(url):
    """Fetch an article list and store every not-yet-seen article in MySQL.

    The JSON response at *url* must contain a ``newslist`` array whose items
    have ``short_url`` and ``title`` keys. For each item whose title is not
    already known, the detail page is scraped with ``getInfo`` and a row is
    inserted into the ``article`` table.

    Args:
        url: Address of the JSON list endpoint.

    Returns:
        A list with one ``{'url': ..., 'title': ...}`` dict per list item,
        in response order, including items skipped as duplicates.
    """
    resp = requests.get(url, timeout=30)
    tags = resp.json()['newslist']

    datas = []

    # Connect to the database.
    conn = Connection(
        host='localhost',
        port=3306,
        user='数据库用户名',
        passwd='数据库密码',
        autocommit=True
    )

    # try/finally guarantees the connection is released even when scraping
    # or a query raises part-way through the list.
    try:
        cursor = conn.cursor()
        conn.select_db("数据库名")

        for tag in tags:
            # A fresh dict per item; reusing one dict would make every entry
            # of the returned list alias the last item.
            result = {'url': tag['short_url'], 'title': tag['title']}
            datas.append(result)

            # Parameterized query: titles routinely contain quotes, which
            # would break (or inject into) a string-formatted statement.
            # NOTE(review): the duplicate check reads `blog_article` while
            # the insert below writes `article` — confirm which table is
            # intended.
            cursor.execute(
                "SELECT id FROM blog_article WHERE title = %s",
                (result['title'],)
            )
            if cursor.fetchone():
                continue

            infolist = getInfo(result['url'], '/uploads/')

            # Insert the article with a publish time inside the coming hour.
            dt = datetime.datetime.fromtimestamp(get_random_next_hour_timestamp())
            nowtime = dt.strftime("%Y-%m-%d %H:%M:%S")
            # ID = current date+hour followed by four random digits.
            insertid = str(time.strftime('%Y%m%d%H')) + str(random.randint(1000, 9999))

            cursor.execute(
                "INSERT ignore INTO article (ID,title,seotitle,fengmian,neirong,addtime) VALUES (%s, %s,%s, %s, %s,%s)",
                (insertid, infolist[0], infolist[0], infolist[2], infolist[1], nowtime)
            )

            upsql = "UPDATE article SET status=1,category_id=1 WHERE id = %s"
            cursor.execute(upsql, (insertid,))

            conn.commit()
    finally:
        conn.close()

    return datas


def replace_strings(text, replacements):
    """Apply every ``old -> new`` substitution from *replacements* to *text*.

    Substitutions run one after another in the mapping's iteration order, so
    a later entry also sees text produced by an earlier one.

    Args:
        text: The string to transform.
        replacements: Mapping of substring to its replacement.

    Returns:
        The transformed string.
    """
    for target in replacements:
        text = text.replace(target, replacements[target])
    return text


def replace_imgstrings(text, replacements):
    """Turn ``<!--NAME-->`` placeholder comments into ``<img>`` tags.

    Args:
        text: HTML text containing placeholder comments.
        replacements: Mapping of placeholder name to image URL.

    Returns:
        *text* with each ``<!--NAME-->`` replaced by
        ``<img src="URL" />``; placeholders without a mapping entry are left
        untouched.
    """
    for placeholder, image_src in replacements.items():
        marker = f"<!--{placeholder}-->"
        img_tag = f'<img src="{image_src}" />'
        text = text.replace(marker, img_tag)
    return text

def find_all_img_tags(html):
    """Return every ``<img ...>`` tag found in *html*, in document order.

    Args:
        html: HTML text to scan.

    Returns:
        A list of the matched tag strings (empty when there are none).
    """
    # Non-greedy up to the first '>' so each match is a single tag.
    return re.findall(r'<img[^>]*?>', html)

def extract_image_urls(html):
    """Collect the double-quoted ``src`` value of each ``<img>`` tag in *html*.

    Args:
        html: HTML text to scan.

    Returns:
        A list of the captured ``src`` strings in document order. Tags whose
        ``src`` is not wrapped in double quotes are not matched.
    """
    src_pattern = re.compile(r'<img[^>]*src="([^"]*)"')
    return [found.group(1) for found in src_pattern.finditer(html)]

def getInfo(url, save_dir):
    """Scrape one article detail page and localize its images.

    Args:
        url: Address of the article detail page.
        save_dir: Base directory passed to ``download_images`` for the
            downloaded image files.

    Returns:
        A three-item list ``[title, content_html, cover_url]``.
        ``cover_url`` is the first downloaded image, or the default
        placeholder image when the article has no images.
    """
    default_cover = 'https://www.codelovers.cn/default.jpg'

    resp = requests.get(url, timeout=30)
    soup = BeautifulSoup(resp.content, "html.parser")

    title = soup.find("h1").get_text()

    # The article payload is a JS object literal inside the third <script>
    # tag (presumably the `window.DATA = {...}` assignment — the index is
    # tied to the current page layout). Cut from the first '{' to the last
    # '}' and parse it as JSON.
    script_text = soup.find_all("script")[2].text
    json_str = script_text[script_text.index('{'):script_text.rindex('}') + 1]
    content = json.loads(json_str)

    contents = str(content['originContent']['text'])
    # Placeholder name -> remote image URL.
    imgdic = {name: attr['url'] for name, attr in content['originAttribute'].items()}

    # Expand the <!--NAME--> image placeholders into real <img> tags.
    contentss = replace_imgstrings(contents, imgdic)

    # Download the article images to the local server and rewrite their URLs.
    imgarr = extract_image_urls(contentss)
    imgurl = []
    newcontent = contentss
    if imgarr:
        imgurl = download_images(imgarr, save_dir, 'https://www.codelovers.cn/uploads')
        if len(imgurl) == len(imgarr):
            newcontent = replace_strings(contentss, dict(zip(imgarr, imgurl)))
        else:
            # The lists are paired by position; if they differ in length the
            # pairing is unreliable, so keep the remote URLs rather than
            # point tags at the wrong files.
            print(f"Image count mismatch for {url}: expected {len(imgarr)}, got {len(imgurl)}")

    newcontent = newcontent.replace("\n", '')
    # Strip layout marker comments; \d+ covers any paragraph count instead of
    # only PARAGRAPH_0..PARAGRAPH_10.
    newcontent = re.sub(r'<!--(?:HPOS_0|PARAGRAPH_\d+)-->', '', newcontent)
    for old, new in (
        ('<STRONG>', ''),
        ('</STRONG>', ''),
        ('<h1>', '<h3>'),
        ('</h1>', '</h3>'),
        ("\xa0", ''),
    ):
        newcontent = newcontent.replace(old, new)

    cover = imgurl[0] if imgurl else default_cover
    return [title, newcontent, cover]


# Run the scraper only when executed as a script, so importing this module
# (e.g. to reuse the helpers) does not trigger network and database access.
if __name__ == "__main__":
    # Replace the placeholder with the list request URL to collect from.
    getList("采集的网址")