豆瓣爬虫到数据库

admin 发表于 2022-5-18 08:50:43

import re

import pymysql
from pymysql.converters import escape_string
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver import Chrome
from selenium.webdriver.common.by import By
# Start a Chrome session and load the first page of the Douban Top 250 list.
web = Chrome()
web.get("https://movie.douban.com/top250")

# Open the MySQL connection used by saveToMySQL() for its inserts.
_db_settings = {
    "db": "douban516",
    "user": "root",
    "passwd": "103415",
    "port": 3306,
    "charset": "utf8",
}
conn = pymysql.connect(**_db_settings)
# 实现将数据保存到数据库的功能
def saveToMySQL(data):
    """Insert one scraped movie record into the ``movie`` table and commit it.

    Args:
        data: 6-tuple ``(moviename, href, director, pf, judgenum, gs)``; the
            items are substituted, in that order, into the INSERT statement.

    Note:
        The values are spliced into the SQL text with ``%`` formatting, so
        every string in ``data`` must already be escaped by the caller (for
        example with ``pymysql.converters.escape_string``).
    """
    print("正在保存数据到数据库.....")

    sql = "insert into movie(moviename, href, director, pf, judgenum, gs) values('%s','%s','%s','%s','%s','%s')"
    # Using the cursor as a context manager closes it even if execute() raises.
    with conn.cursor() as cur:
        cur.execute(sql % data)
    # PyMySQL does not autocommit by default; without an explicit commit the
    # inserted row is discarded when the connection is closed.
    conn.commit()

# Walk every result page, store each movie row, and stop when the pager no
# longer offers a "next" link. The browser window is closed even on failure.
try:
    while True:
        # One <li> per movie on the current page.
        lis = web.find_elements(By.XPATH, '//*[@id="content"]/div/div/ol/li')

        for li in lis:
            # The title link carries both the movie name and the detail URL,
            # so it is looked up once and reused.
            title_link = li.find_element(By.XPATH, './/div[@class="hd"]/a')
            moviename = escape_string(title_link.text)
            href = title_link.get_attribute("href")

            # Text of the first <p> under div.bd (director information).
            director = li.find_element(By.XPATH, './/div[@class="bd"]//p').text
            director = escape_string(director)

            # Rating score.
            pf = li.find_element(By.XPATH, './/span[@class="rating_num"]').text

            # Number of ratings: keep only the digits of the span text.
            # findall() returns a list, so the pieces are joined into one string.
            # NOTE(review): this takes the FIRST span under div.star - confirm
            # on the live page that it is the vote-count span.
            star_text = li.find_element(By.XPATH, './/div[@class="star"]/span').text
            judgenum = "".join(re.findall(r"\d+", star_text))

            # One-line summary quote; some entries have none.
            try:
                gs = escape_string(
                    li.find_element(By.XPATH, './/span[@class="inq"]').text
                )
            except NoSuchElementException:
                gs = ""

            # Strings are escaped above because saveToMySQL() interpolates the
            # tuple straight into the SQL text.
            mess_tup = (moviename, href, director, pf, judgenum, gs)
            saveToMySQL(mess_tup)

        # A missing "next" link ends the crawl.
        try:
            next_link = web.find_element(By.XPATH, '//span[@class="next"]/a')
        except NoSuchElementException:
            break
        next_link.click()
        # Sets the driver-wide implicit wait used by subsequent element
        # lookups (it is a lookup timeout, not a fixed pause).
        web.implicitly_wait(5)
finally:
    web.close()

页: [1]

php中文网 | cnphp.com's Archiver

豆瓣爬虫到数据库