编程爱好者之家 walks you through the complete Python code for scraping a given Weibo user's post list, plus each post's detail page, into a database.
(Note: fetching the page HTML directly will not return the data; it is served by an API.)
You can find the API endpoint via the browser's F12 --> Network panel: look for requests whose names start with mymblog; that request's URL is the endpoint.
After pretty-printing the API response as JSON, you will see that the fields we need to collect, the post title and the detail-page link, sit inside url_struct.
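For orientation, here is a minimal sketch of the response shape that the code below relies on. All values are illustrative placeholders, not real data; the scraper only reads the url_title and long_url fields:

# Illustrative shape only -- values are placeholders
resp_json = {
    "data": {
        "list": [
            {
                "url_struct": [
                    {
                        "url_title": "example post title",  # becomes the article title
                        "long_url": "https://weibo.com/ttarticle/p/show?id=example"  # detail page
                    }
                ]
            }
        ]
    }
}
# The only two fields read per post:
title = resp_json["data"]["list"][0]["url_struct"][0]["url_title"]
link = resp_json["data"]["list"][0]["url_struct"][0]["long_url"]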
2. The complete code is as follows:
import requests
import os
import datetime
import random
import re
import time
from bs4 import BeautifulSoup
from pymysql import Connection


# Create a year/day folder under `path`; returns
# [absolute folder path, relative "year/day" fragment].
def create_folder(path):
    year = datetime.datetime.now().strftime('%Y')
    day = datetime.datetime.now().strftime('%Y%m%d')
    foldername = path + "/" + year + "/" + day
    folder = year + "/" + day
    # Create the folder if it does not exist yet
    if not os.path.exists(foldername):
        os.makedirs(foldername)
    return [foldername, folder]


# True if the string does not contain "海贼王" (One Piece)
def does_not_contain_one_piece(s):
    return "海贼王" not in s


# Random timestamp within the three hours after the current time
def random_timestamp_in_three_hours():
    current_timestamp = time.time()
    three_hours_in_seconds = 3 * 3600
    # Random offset between 0 seconds and three hours
    random_seconds = random.randrange(three_hours_in_seconds)
    return int(current_timestamp + random_seconds)


# Download every image in image_list into a dated folder under
# output_folder; returns the list of rewritten public URLs.
def download_images(image_list, output_folder, domain):
    imgarr = []
    newfolder = create_folder(output_folder)
    for index, image_url in enumerate(image_list):
        try:
            headers = {
                "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3",
                "Referer": "https://weibo.com"
            }
            response = requests.get(image_url, headers=headers)
            try:
                response.raise_for_status()
                # Date plus random number as the file name
                day = datetime.datetime.now().strftime('%Y%m%d%H%M%S')
                random_number = random.randint(1000, 100000)
                file_name = str(day) + str(random_number) + '.jpg'
                file_path = os.path.join(newfolder[0], file_name)
                with open(file_path, 'wb') as file:
                    file.write(response.content)
                imgarr.append(domain + "/" + newfolder[1] + "/" + file_name)
            except requests.exceptions.HTTPError:
                # Non-2xx response: fall back to a default image
                imgarr.append('https://www.codelovers.cn/default.jpg')
        except requests.exceptions.RequestException as e:
            print(f"Error downloading image at index {index}: {str(e)}")
    return imgarr


# Fetch the list endpoint, extract each post's title and detail-page
# URL from url_struct, and store new posts in the database.
def getList(url):
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.82 Safari/537.36",
        "Referer": "https://weibo.com"
    }
    # Use your own logged-in Weibo cookie here
    cookies = {
        "cookie": "b-user-id=1ace80de-cec1-f7d2-6f97-e75e6fdc72d1; SINAGLOBAL=8461759422508.164.1704334076448; UOR=,,www.baidu.com; SCF=Aqv8vefHkkcK09KTAay8SrTkajfshEX4i70hMRlpGj3LAuYQ9jaTMPOCbaWz8H0ZjiYuV0_jf6pkotkR3TIteig.; ULV=1721186579194:133:15:4:8635286197188.025.1721186579190:1721115212235; XSRF-TOKEN=HfJystghBUtKCoiNfJ4TdiNf; ALF=1723801332; SUB=_2A25Lk-GkDeRhGeBO4loT-SbPzjSIHXVo0XtsrDV8PUJbkNAbLWTtkW1NRZ9YdyZl4M1sdlEcN1sXgtP6smYuNU6Z; SUBP=0033WrSXqPxfM725Ws9jqgMF55529P9D9WhknqH2zSLepSXXbdRPTjTR5JpX5KzhUgL.Foq71KnE1Kn0SKn2dJLoIpjLxKML12-LB-zLxKMLB-zLBo.LxKqL1KqL12zt; WBPSESS=Dt2hbAUaXfkVprjyrAZT_G-6RpX6aDEyx5GTIs9qxkhDgt8lDoID-y3APuOogTS6Jy7F4WJunK6FVfGyohfhicEbfViv3z9d_d-ML7wnEHcOw3Cdld7sG03Lv_PAmCOqjQiBV4ExNR-Oct5huEUsk21X7JLgYLKWVbi0MTU8IZg="
    }
    resp = requests.get(url, headers=headers, cookies=cookies)
    resp = resp.json()
    tags = resp['data']['list']
    datas = []
    # Connect to the database
    conn = Connection(
        host='localhost',
        port=3306,
        user='root',
        passwd='anke123',
        autocommit=True
    )
    cursor = conn.cursor()
    conn.select_db("meizhuzhai")
    for tag in tags:
        # Fresh dict per post so every entry in datas is independent
        result = {}
        result['url'] = tag['url_struct'][0]['long_url']
        result['title'] = tag['url_struct'][0]['url_title']
        # Parameterized query: safe against quotes in the title
        cursor.execute("SELECT id FROM blog_article WHERE title = %s", (result['title'],))
        res = cursor.fetchone()
        if res:
            # Title already in the database: skip it
            continue
        infolist = getInfo(result['url'], '/home/uploads/')
        if infolist:
            # Insert the post, then set its url_link
            dt = datetime.datetime.fromtimestamp(random_timestamp_in_three_hours())
            nowtime = dt.strftime("%Y-%m-%d %H:%M:%S")
            insertid = str(time.strftime('%Y%m%d%H')) + str(random.randint(1000, 9999))
            cursor.execute(
                "INSERT IGNORE INTO article (id,title,seotitle,fengmian,neirong,`jianjie`,addtime) "
                "VALUES (%s,%s,%s,%s,%s,%s,%s)",
                (insertid, infolist[0], infolist[0], infolist[2], infolist[1], infolist[0], nowtime))
            upsql = "UPDATE article SET status=0,url_link = %s WHERE id = %s"
            cursor.execute(upsql, ('/news/' + str(insertid) + '.html', insertid))
            conn.commit()
        datas.append(result)
    # Close the connection once all posts are processed
    conn.close()
    return datas


# Replace several substrings in one call
def replace_strings(text, replacements):
    for old_str, new_str in replacements.items():
        text = text.replace(old_str, new_str)
    return text


def remove_attribute(html, tag, attribute):
    """Strip the given attribute from every matching HTML tag."""
    # Regex that matches the tag together with the attribute
    pattern = r'<{0}[^>]*{1}=["\'][^"\']+["\'][^>]*>'.format(tag, attribute)
    regex = re.compile(pattern)
    # Cut each matched tag off at the attribute and re-close it
    new_html = regex.sub(lambda match: match.group(0).split(attribute)[0] + '>', html)
    return new_html


# Fetch a detail page; returns [title, cleaned body HTML, cover image URL]
def getInfo(url, save_dir):
    imgs = []
    result = []
    imgurl = []
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3",
    }
    resp = requests.get(url, headers=headers)
    try:
        soup = BeautifulSoup(resp.content, "html.parser")
        content = soup.find('div', class_='WB_editor_iframe_new')
        # Download the article's images to the local server
        img = content.find_all("img")
        if img:
            for img_tag in img:
                imgs.append(img_tag.get('src'))
            imgurl = download_images(imgs, save_dir, 'https://www.codelovers.cn/uploads')
            # Rewrite the original image URLs to the local copies
            dic = {}
            for index, img_tag in enumerate(img):
                dic[img_tag.get('src')] = imgurl[index]
            newcontent = replace_strings(str(content), dic)
        else:
            newcontent = str(content)
        # Strip wrapper markup and normalize tags
        newcontent = newcontent.replace("\n", '')
        newcontent = newcontent.replace('<div class="WB_editor_iframe_new" node-type="contentBody" style="visibility: hidden">', '')
        newcontent = newcontent.replace('<div class="DCI_v2 clearfix">', '')
        newcontent = newcontent.replace("</div>", '')
        newcontent = newcontent.replace('<figure class="image">', '')
        newcontent = newcontent.replace('</figure>', '')
        newcontent = newcontent.replace('<strong>', '')
        newcontent = newcontent.replace('</strong>', '')
        newcontent = newcontent.replace('<h1>', '<h3>')
        newcontent = newcontent.replace('</h1>', '</h3>')
        newcontent = newcontent.replace('sizes="100vw"', '')
        newcontent = newcontent.replace('<p>\xa0</p>', '')
        newcontent = remove_attribute(newcontent, 'img', 'srcset')
        result.append(soup.find('div', class_='title').text)
        result.append(newcontent)
        # First downloaded image as cover, or a default placeholder
        if len(imgurl) != 0:
            result.append(imgurl[0])
        else:
            result.append('https://www.codelovers.cn/default.jpg')
        return result
    except AttributeError:
        # Expected page structure not found on this page
        return ''
Finally, to run the scraper, just call getList() with the API endpoint URL.
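For example (the endpoint path and uid below are placeholders; copy the real mymblog request URL for your target user from the Network panel):

if __name__ == "__main__":
    # Placeholder URL -- use the actual "mymblog" request from F12 --> Network
    api_url = "https://weibo.com/ajax/statuses/mymblog?uid=1234567890&page=1"
    posts = getList(api_url)
    print(f"Collected {len(posts)} posts")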