Python之requests+xpath爬取猫眼电影并写入数据库(图文教程)
2020-06-28 11:38:30 来源:易采站长站 作者:易采站长站整理
print("正在储存电影......{}".format(name))
actors_ar = tree.xpath('//div[@class="celebrity-group"][2]//li/div[@class="info"]/a/text()') # 演员列表
types = tree.xpath('string(//li[@class="ellipsis"])').replace("\n", "").replace(" ", "") # 字符串
intro = str(tree.xpath('string(//span[@class="dra"])'))
actors = '|'.join(actors_ar).replace("\n", "").replace(" ", "") #将演员列表拼接为字符串
结果你们可以自己去打印一下,如果没有遇到验证码就能爬到了
最后的抓取猫眼电影的全部代码,封装成类,养成良好的代码习惯
import requests
from lxml import etree
from mysql_api import mysqlConn
from fake_useragent import UserAgent
from pymysql import errclass maoYan_spider:
headers = {
"User-Agent": UserAgent().random
}
def get_urls(self, url):
    """Fetch a film listing page and collect poster and detail links.

    Prints the requested address, downloads it with the class-level
    ``headers`` and runs two XPath queries against the parsed HTML.

    Args:
        url: Address of the listing page to download.

    Returns:
        A ``(posters, details)`` tuple holding the ``data-src`` values of
        the poster images and the ``href`` values of the detail links.
    """
    print("url: " + url)
    page = etree.HTML(requests.get(url=url, headers=self.headers).text)
    # Per the original author's note, poster sources are complete URLs
    # that can be opened directly.
    poster_srcs = page.xpath('//dl/dd//img[2]/@data-src')
    # Per the original author's note, detail hrefs are only the trailing
    # part of the address and need 'https://maoyan.com' prepended.
    detail_hrefs = page.xpath('//dd/div[@class="movie-item film-channel"]/a/@href')
    return poster_srcs, detail_hrefs
def save_data(self, img_src, url):
    """Scrape one film detail page and insert it into the ``maoyan`` table.

    Args:
        img_src: Poster image address, stored in the ``m_src`` column.
        url: Detail page address; it is downloaded and stored in ``m_link``.

    Returns:
        False when the page yields no ``<h1>`` text, which this scraper
        treats as having hit a captcha page. True otherwise, including the
        case where the row was discarded because the database raised
        ``ProgrammingError``.
    """
    # Imported locally so this method is self-contained; the file already
    # depends on pymysql (``from pymysql import err``).
    from pymysql.converters import escape_string

    resp = requests.get(url=url, headers=self.headers)
    tree = etree.HTML(resp.content.decode("utf-8"))
    name = str(tree.xpath('string(//h1)'))
    # The message previously had no placeholder, so the title was never shown.
    print("正在储存电影......{}".format(name))
    if name == "":
        print("遇到验证码, 程序停止")
        return False

    # Names of the people listed in the second "celebrity-group" section.
    actors_ar = tree.xpath('//div[@class="celebrity-group"][2]//li/div[@class="info"]/a/text()')
    # Strip line breaks and spaces. The escape must be "\n": a bare "n"
    # would delete every letter n from the scraped text.
    types = tree.xpath('string(//li[@class="ellipsis"])').replace("\n", "").replace(" ", "")
    intro = str(tree.xpath('string(//span[@class="dra"])'))
    # Join the names into a single '|'-separated string.
    actors = '|'.join(actors_ar).replace("\n", "").replace(" ", "")

    # The values come from a remote page and are placed inside double-quoted
    # SQL literals, so quotes and backslashes must be escaped; otherwise a
    # single '"' in the synopsis breaks (or hijacks) the statement.
    # NOTE(review): if exe_sql can forward query parameters to the driver,
    # prefer real parameter binding over manual escaping - confirm its API.
    fields = (name, types, img_src, url, intro, actors)
    escaped = tuple(escape_string(str(value)) for value in fields)
    sql = 'insert into maoyan (m_name, m_type, m_src, m_link, m_intro, m_actors) values ("%s","%s","%s","%s","%s","%s")' % escaped
    try:
        self.connect.exe_sql(sql)
    except err.ProgrammingError:
        # Best effort: drop the record the database rejects and carry on.
        print("该条编码有问题,舍弃")
    return True
def run(self):
self.connect = mysqlConn()
self.connect.get_conn("movies")
tag = True
#爬取前两页的电影
for i in range(2):
main_url = "https://maoyan.com/films?showType=3&offset={}".format(30 * i)













闽公网安备 35020302000061号