编程爱好者之家
编程爱好者之家为大家带来python采集腾讯新闻某个用户的发表数据列表以及详情页数据到数据库
按照图示按 F12 打开开发者工具,切换到 Network 面板,找到名为 getSubNewsMixedList 的请求,它的响应里面就是整个列表的数据
(PS:直接浏览器里面的页面地址是采集不到列表页数据的)
getSubNewsMixedList里面数据如图所示
了解了数据结构就可以继续下面的操作了
注意:详情页的数据需要查看页面源代码,数据都在 window.DATA 里面,直接从页面正文的标签中提取是获取不到的
import datetime
import json
import os
import random
import re
import string
import time
from urllib.parse import urljoin
from urllib.request import urlretrieve

import pymysql
import requests
from bs4 import BeautifulSoup
from pymysql import Connection

# Placeholder image used whenever a picture cannot be downloaded or an
# article has no pictures at all.
_DEFAULT_IMAGE_URL = 'https://www.codelovers.cn/default.jpg'

# Seconds to wait for any HTTP response; without a timeout a stalled server
# would hang the whole crawl.
_REQUEST_TIMEOUT = 30

# Matches the paragraph placeholder comments embedded in the article body,
# whatever their index.
_PARAGRAPH_MARKER = re.compile(r'<!--PARAGRAPH_\d+-->')


def create_folder(path):
    """Create (if needed) a dated sub-folder ``<path>/<YYYY>/<YYYYMMDD>``.

    Returns a two-item list: the full folder path on disk, and the relative
    ``<YYYY>/<YYYYMMDD>`` part.
    """
    # Take a single timestamp so the year and day parts can never disagree.
    now = datetime.datetime.now()
    year = now.strftime('%Y')
    day = now.strftime('%Y%m%d')
    foldername = path + "/" + year + "/" + day
    folder = year + "/" + day
    os.makedirs(foldername, exist_ok=True)
    return [foldername, folder]


def get_random_next_hour_timestamp():
    """Return an integer Unix timestamp at a random moment within the next hour.

    The offset added to the current time is 0-59 minutes plus 0-59 seconds.
    """
    random_minute = random.randint(0, 59)
    random_second = random.randint(0, 59)
    offset = datetime.timedelta(minutes=random_minute, seconds=random_second)
    return int(time.time() + offset.total_seconds())


def download_images(image_list, output_folder, domain):
    """Download every image URL into a dated folder below *output_folder*.

    Returns a list with exactly one entry per input URL, in the same order:
    the public URL of the saved copy (``domain`` + dated folder + file name),
    or the default placeholder image when the download or the write failed.
    Callers pair the result with *image_list* by position, so the two lists
    must always have the same length.
    """
    saved_urls = []
    disk_dir, relative_dir = create_folder(output_folder)
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3",
        "Referer": "https://www.baidu.com"
    }
    for index, image_url in enumerate(image_list):
        try:
            response = requests.get(image_url, headers=headers,
                                    timeout=_REQUEST_TIMEOUT)
            response.raise_for_status()
            # File name: current date-time plus a random number.
            stamp = datetime.datetime.now().strftime('%Y%m%d%H%M%S')
            file_name = stamp + str(random.randint(1000, 100000)) + '.jpg'
            file_path = os.path.join(disk_dir, file_name)
            with open(file_path, 'wb') as file:
                file.write(response.content)
        except (requests.exceptions.RequestException, OSError) as e:
            print(f"Error downloading image at index {index}: {str(e)}")
            # Keep the output aligned with the input even on failure.
            saved_urls.append(_DEFAULT_IMAGE_URL)
        else:
            saved_urls.append(
                str(domain) + "/" + relative_dir + "/" + file_name)
    return saved_urls


def getList(url):
    """Fetch the article list JSON from *url* and store new articles.

    For each entry of the response's ``newslist`` the title is looked up in
    the database; entries that already exist are skipped, the others are
    fetched with ``getInfo`` and inserted.

    Returns a list of ``{'url': ..., 'title': ...}`` dicts, one per article
    that was newly stored during this call.
    """
    resp = requests.get(url, timeout=_REQUEST_TIMEOUT)
    tags = resp.json()['newslist']
    datas = []

    # Connect to the database.
    conn = Connection(
        host='localhost',
        port=3306,
        user='数据库用户名',
        passwd='数据库密码',
        autocommit=True
    )
    try:
        cursor = conn.cursor()
        conn.select_db("数据库名")
        for tag in tags:
            # A fresh dict per article so the returned entries are distinct.
            item = {'url': tag['short_url'], 'title': tag['title']}

            # Parameterized query: titles come from the remote site and may
            # contain quotes.
            # NOTE(review): the duplicate check reads `blog_article` while the
            # insert below writes `article` - confirm which table is intended.
            cursor.execute(
                "SELECT id FROM blog_article WHERE title = %s",
                (item['title'],))
            if cursor.fetchone():
                continue

            infolist = getInfo(item['url'], '/uploads/')
            # Insert the article, then update its status and category.
            dt = datetime.datetime.fromtimestamp(
                get_random_next_hour_timestamp())
            nowtime = dt.strftime("%Y-%m-%d %H:%M:%S")
            insertid = str(time.strftime('%Y%m%d%H')) + str(random.randint(1000, 9999))
            cursor.execute(
                "INSERT ignore INTO article (ID,title,seotitle,fengmian,neirong,addtime) VALUES (%s, %s,%s, %s, %s,%s)",
                (insertid, infolist[0], infolist[0], infolist[2], infolist[1], nowtime))
            upsql = "UPDATE article SET status=1,category_id=1 WHERE id = %s"
            cursor.execute(upsql, (insertid,))
            conn.commit()
            datas.append(item)
    finally:
        # Close the connection once, after all items, even if one of them fails.
        conn.close()
    return datas


def replace_strings(text, replacements):
    """Replace every key of *replacements* found in *text* with its value."""
    for old_str, new_str in replacements.items():
        text = text.replace(old_str, new_str)
    return text


def replace_imgstrings(text, replacements):
    """Turn ``<!--NAME-->`` placeholders into ``<img>`` tags.

    *replacements* maps a placeholder name to the image URL to use as ``src``.
    """
    for old_str, new_str in replacements.items():
        text = text.replace("<!--"+str(old_str)+"-->", '<img src="'+str(new_str)+'" />')
    return text


def find_all_img_tags(html):
    """Return every ``<img ...>`` tag found in *html* as a list of strings."""
    img_pattern = re.compile(r'<img[^>]*?>')
    return img_pattern.findall(html)


def extract_image_urls(html):
    """Return the ``src`` value of every ``<img>`` tag in *html*."""
    return re.findall(r'<img[^>]*src="([^"]*)"', html)


def getInfo(url, save_dir):
    """Fetch one article page and return ``[title, content_html, cover_url]``.

    The article body is read from the JSON object embedded in the page's
    third ``<script>`` element. Its images are downloaded into *save_dir*
    and the body is rewritten to point at the local copies. ``cover_url`` is
    the first downloaded image, or the default placeholder when the article
    has no images.
    """
    imgurl = []
    resp = requests.get(url, timeout=_REQUEST_TIMEOUT)
    soup = BeautifulSoup(resp.content, "html.parser")
    title = soup.find("h1").get_text()

    # Cut the JSON object out of the script text: everything from the first
    # '{' to the last '}'.
    script_text = soup.find_all("script")[2].text
    json_str = script_text[script_text.index('{'):script_text.rindex('}') + 1]
    content = json.loads(json_str)

    contents = content['originContent']['text']
    # Map placeholder name -> image URL.
    imgdic = {name: attr['url']
              for name, attr in content['originAttribute'].items()}
    contentss = replace_imgstrings(str(contents), imgdic)

    # Download the article's images and point the markup at the local copies.
    imgarr = extract_image_urls(contentss)
    if imgarr:
        imgurl = download_images(imgarr, save_dir, 'https://www.codelovers.cn/uploads')
        newcontent = replace_strings(str(contentss), dict(zip(imgarr, imgurl)))
    else:
        newcontent = str(contentss)

    # Strip layout placeholders and normalise the markup.
    newcontent = newcontent.replace("\n", '')
    newcontent = newcontent.replace('<!--HPOS_0-->', '')
    newcontent = _PARAGRAPH_MARKER.sub('', newcontent)
    newcontent = newcontent.replace('<STRONG>', '')
    newcontent = newcontent.replace('</STRONG>', '')
    newcontent = newcontent.replace('<h1>', '<h3>')
    newcontent = newcontent.replace('</h1>', '</h3>')
    newcontent = newcontent.replace("\xa0", '')

    cover = imgurl[0] if imgurl else _DEFAULT_IMAGE_URL
    return [title, newcontent, cover]


getList("采集的网址")
windows11安装Java8(jdk1.8)详细教程
linux系统安装python 3.12.0教程
python字符串requests获取数据怎么转换为字典
python采集B站某个用户的发表图文数据列表以及详情页数据到数据库代码
python采集微博某个用户的发表数据列表以及详情页数据到数据库
windows系统在cmd中执行 pip install numpy没反应解决办法
linux安装好python3后使用python命令提示-bash: python: command not found
python获取当前时间三个小时之后的随机时间戳
python删除网页中含有lazy.png字符串的img标签并返回删除后的字符串
windows 10 11系统安装Anaconda详细教程