Problems
pip installation: https://blog.csdn.net/qq_37176126/article/details/72824404 (the third-party packages used in these notes are beautifulsoup4 and xlwt, e.g. pip install beautifulsoup4 xlwt)
Basics
urllib
import urllib.request
# get
# response = urllib.request.urlopen("http://www.baidu.com")
# print(response.read().decode("utf-8"))
# post
# import urllib.parse
# data = bytes(urllib.parse.urlencode({"hello":"world"}), encoding="utf-8")
# response = urllib.request.urlopen("http://httpbin.org/post", data= data)
# print(response.read().decode("utf-8"))
# Timeout handling
# try:
#     response = urllib.request.urlopen("http://www.baidu.com", timeout=0.1)  # triggers a timeout
#     print(response.read().decode("utf-8"))
# except urllib.error.URLError as e:
#     print(e, "超时")
# response = urllib.request.urlopen("http://www.mingyuefusu.cn", timeout=0.1)  # triggers a timeout
# print(response.status)  # 418 means the crawler was refused
# print(response.getheaders())
# print(response.getheader("Server"))
# Pretend to be a browser by sending a User-Agent header
# url = "http://httpbin.org/post"
# headers = {
#     "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:77.0) Gecko/20100101 Firefox/77.0"
# }
# data = bytes(urllib.parse.urlencode({'name': 'ming'}), encoding="utf-8")
# req = urllib.request.Request(url=url, data=data, headers=headers, method="POST")
# response = urllib.request.urlopen(req)
# print(response.read().decode("utf-8"))
# Fetch douban.com
url = "https://www.douban.com"
headers = {
"User-Agent" : "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:77.0) Gecko/20100101 Firefox/77.0"
}
req = urllib.request.Request(url=url, headers=headers)
response = urllib.request.urlopen(req)
print(response.read().decode("utf-8"))
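As a small complement (my addition, not from the original notes): a GET request that carries URL-encoded query parameters, with HTTPError and URLError handled separately; it assumes http://httpbin.org/get is reachable.
import urllib.parse
import urllib.request
import urllib.error

params = urllib.parse.urlencode({"keyword": "python", "page": 1})
get_url = "http://httpbin.org/get?" + params
try:
    with urllib.request.urlopen(get_url, timeout=5) as resp:
        print(resp.status)                   # 200 on success
        print(resp.read().decode("utf-8"))   # httpbin echoes the query arguments back
except urllib.error.HTTPError as e:          # HTTP status errors, e.g. 404 or 418
    print("HTTPError:", e.code, e.reason)
except urllib.error.URLError as e:           # network-level errors (DNS failure, timeout, ...)
    print("URLError:", e.reason)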
bs4
'''
BeautifulSoup4 turns an HTML document into a tree structure; every node is a Python object.
The objects fall into four kinds:
- Tag: the first matching tag, including its contents
- NavigableString: the text inside a tag (.string)
- BeautifulSoup: the whole document
- Comment: a special NavigableString; its output does not include the comment markers
'''
from bs4 import BeautifulSoup
file = open("./baidu.htm", "rb")  # read as bytes
html = file.read().decode("utf-8")
bs = BeautifulSoup(html, "html.parser")
# 1. Tag
# print(bs.title)  # get the tag
# print(bs.a)
# print(type(bs.body))
# 2. NavigableString
# print(bs.title.string)  # only the text inside the tag
# print(type(bs.title.string))
# print(bs.a.attrs)  # get the tag's attributes
# 3. BeautifulSoup
# print(type(bs))
# print(bs.name)
# print(bs)  # the whole document
# 4. Comment: a special NavigableString; the comment markers are not included
# print(bs.a.string)
# ------------------------
# 1. Traversing the document
# print(bs.head.contents)
print(bs.head.contents[2])
# 2. Searching the document
# (1) find_all(): find all matching tags
# String filter: find content that matches the string exactly
# t_list = bs.find_all("a")
# print(t_list)
# Regular-expression filter: matched via search()
import re
# t_list = bs.find_all(re.compile("a"))
# print(t_list)
# Function filter: pass in a function; tags for which it returns True are kept
def name_is_exists(tag):
    return tag.has_attr("name")
t_list = bs.find_all(name_is_exists)
# print(t_list)
# iterate over the results
# for list in t_list:
#     print(list)
# 2. kwargs arguments
# t_list = bs.find_all(id = "head")
# t_list = bs.find_all(class_ = True)
# t_list = bs.find_all(href="http://wenku.baidu.com/search?word=&lm=0&od=0&ie=utf-8")
# for item in t_list:
#     print(item)
# 3. text argument
# t_list = bs.find_all(text="hao123")
# t_list = bs.find_all(text=["hao123", "地图", "贴吧", "???"])
# t_list = bs.find_all(text=re.compile(r"\d"))  # regular expression
# 4. limit
# t_list = bs.find_all("a", limit = 3)
# 5. CSS selectors: tag name, id ("#title"), class (".mnav"), attribute ("a[class='button']")
# select("body>div>a") for child tags, "~" for siblings; t_list[0].get_text() returns the text (more selector forms in the sketch below)
print(bs.select('title'))
t_list = bs.select('.mnav')
for item in t_list:
    print(item)
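A few more selector forms as a self-contained sketch (my addition; the small HTML string is made up for illustration):
from bs4 import BeautifulSoup

demo_html = """
<div id="head">
  <a class="mnav" href="http://news.baidu.com">新闻</a>
  <a class="mnav" href="http://map.baidu.com">地图</a>
</div>
"""
demo = BeautifulSoup(demo_html, "html.parser")
print(demo.select("#head"))             # by id
print(demo.select("a[class='mnav']"))   # by attribute
print(demo.select("div > a"))           # direct children
print(demo.select("a ~ a"))             # following siblings
print(demo.select("a")[0].get_text())   # text of the first match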
Regular expressions
import re
# create a pattern object
pat = re.compile("AA")  # the pattern (what to match)
res = pat.search("dAAAd")  # the string being checked
print(res)
# without a pattern object
# res = re.search("AA", "DAAA")  # pattern first, string to check second
# print(res)
# res = re.findall("AA", "AAAAAA")  # pattern first, string to check second
# res = re.findall("[A-Z]+", "AAAAAA")  # pattern first, string to check second
res = re.sub("A", "a", "AAAAAA")  # replace every "A" with "a"
print(res)
# Prefer raw strings (r"...") so you do not have to worry about escape characters
# a = r"\assd\'"
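A short sketch (my addition) of the regex pieces the Douban example below relies on: a raw-string pattern, a non-greedy capture group, and re.S so that "." also matches newlines:
import re

sample = '<span class="title">Title A</span>\n<span class="title">Title B</span>'
find_title = re.compile(r'<span class="title">(.*?)</span>', re.S)  # raw string, non-greedy group
print(find_title.findall(sample))       # ['Title A', 'Title B']
print(re.search(r"\d+", "abc123def"))   # match object for the first run of digits
print(re.sub(r"\s+", " ", "a   b\nc"))  # collapse whitespace -> 'a b c'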
xlwt
import xlwt
workbook = xlwt.Workbook(encoding="utf-8")  # create a Workbook object
worksheet = workbook.add_sheet('sheet1')  # add a worksheet
# write a multiplication table: row h, column l
for h in range(1, 10):
    for l in range(1, h + 1):
        # worksheet.write(h, l, )
        # print("{0}*{1}={2} ".format(h, l, h*l), end="")
        worksheet.write(h, l, "{0}*{1}={2} ".format(l, h, h * l))
        # ("%d * %d = %d" % (h, l, h*l))
    # print()
# worksheet.write(0, 0, 'hello')  # write a value at (row 0, col 0)
# worksheet.write(0, 1, 'Excel')  # write a value at (row 0, col 1)
workbook.save('helloExcel.xls')  # save the spreadsheet
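An optional extra (my addition) showing two xlwt features the notes do not use: cell styles via easyxf and column widths:
import xlwt

wb = xlwt.Workbook(encoding="utf-8")
ws = wb.add_sheet("styled")
header_style = xlwt.easyxf("font: bold on; align: horiz center")  # bold, centred header cells
ws.write(0, 0, "name", header_style)
ws.write(0, 1, "score", header_style)
ws.write(1, 0, "example")
ws.write(1, 1, 95)
ws.col(0).width = 256 * 20  # width unit is 1/256 of a character
wb.save("styledExcel.xls")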
Example
Scraping Douban Movie Top 250
Save to Excel
from bs4 import BeautifulSoup  # parse the page and extract data
import re  # regular expressions, text matching
import urllib.request, urllib.error  # build the URL and fetch the page data
import xlwt  # Excel output
import sqlite3  # SQLite database access

findLink = re.compile(r'<a href="(.*?)">')  # pattern that extracts the URL from the <a> link
findImgSrc = re.compile(r'<img.*src="(.*?)"', re.S)  # with re.S, "." also matches newlines
findTitle = re.compile(r'<span class="title">(.*?)</span>')  # film title
findRate = re.compile(r'<span class="rating_num" property="v:average">(.*?)</span>')  # rating
findComNum = re.compile(r'<span>(\d*?)人评价</span>')  # number of ratings
findInq = re.compile(r'<span class="inq">(.*?)</span>')  # one-line summary
findBd = re.compile(r'<p class="">(.*?)</p>', re.S)  # related details block

# Crawl the pages
def getData(baseUrl):
    datalist = []
    for i in range(0, 10):  # 10 pages, 25 films per page
        url = baseUrl + str(i * 25)
        html = askURL(url)  # fetch and keep the page source
        # 2. Parse the data item by item
        soup = BeautifulSoup(html, "html.parser")
        for item in soup.find_all('div', class_="item"):
            # print(item)
            data = []  # all the fields of one film
            item = str(item)
            # apply the patterns
            link = re.findall(findLink, item)[0]
            data.append(link)
            imgSrc = re.findall(findImgSrc, item)[0]
            data.append(imgSrc)
            titles = re.findall(findTitle, item)
            if len(titles) == 2:
                ctitle = titles[0]  # Chinese title
                data.append(ctitle)
                otitle = titles[1].replace("/", "")  # original (foreign) title
                data.append(otitle)
            else:
                data.append(titles[0])
                data.append(" ")  # no foreign title
            rate = re.findall(findRate, item)[0]
            data.append(rate)
            comNum = re.findall(findComNum, item)[0]
            data.append(comNum)
            inq = re.findall(findInq, item)
            if len(inq) != 0:
                inq = inq[0].replace("。", "")
            else:
                inq = " "
            data.append(inq)
            bd = re.findall(findBd, item)[0]
            bd = re.sub(r'<br(\s+)?/>(\s+)?', " ", bd)
            bd = re.sub('/', " ", bd)
            data.append(bd.strip())
            datalist.append(data)
    # for data in datalist:
    #     print(data)
    return datalist
# Save the data
def saveData(datalist, savePath):
    workbook = xlwt.Workbook(encoding="utf-8", style_compression=0)  # create the workbook (no style compression)
    sheet = workbook.add_sheet('豆瓣电影Top250', cell_overwrite_ok=True)  # allow overwriting cells
    col = ('链接', '图片链接', '标题', '外国标题', '评分', '评价人数', '概述', '影片标签')
    for i in range(0, 8):
        sheet.write(0, i, col[i])  # header row
    for i in range(0, 250):
        data = datalist[i]
        for j in range(0, 8):
            sheet.write(i + 1, j, data[j])
    workbook.save(savePath)
# Fetch the page content for a URL
def askURL(url):
    # Pretend to be a browser: tell Douban what kind of client we are and what data we accept
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:77.0) Gecko/20100101 Firefox/77.0"
    }
    request = urllib.request.Request(url, headers=headers)
    html = ""
    try:
        response = urllib.request.urlopen(request)
        html = response.read().decode("utf-8")
        # print(html)
    except urllib.error.URLError as e:
        if hasattr(e, "code"):
            print(e.code)
        if hasattr(e, "reason"):
            print(e.reason)
    return html
def main():
    baseUrl = "https://movie.douban.com/top250?start="
    # 1. Crawl the pages
    datalist = getData(baseUrl)
    mySavePath = "豆瓣电影Top250.xls"
    # 3. Save the data
    saveData(datalist, mySavePath)

if __name__ == "__main__":
    main()
    print("爬取结束!")
Save to SQLite
from bs4 import BeautifulSoup  # parse the page and extract data
import re  # regular expressions, text matching
import urllib.request, urllib.error  # build the URL and fetch the page data
import xlwt  # Excel output
import sqlite3  # SQLite database access

findLink = re.compile(r'<a href="(.*?)">')  # pattern that extracts the URL from the <a> link
findImgSrc = re.compile(r'<img.*src="(.*?)"', re.S)  # with re.S, "." also matches newlines
findTitle = re.compile(r'<span class="title">(.*?)</span>')  # film title
findRate = re.compile(r'<span class="rating_num" property="v:average">(.*?)</span>')  # rating
findComNum = re.compile(r'<span>(\d*?)人评价</span>')  # number of ratings
findInq = re.compile(r'<span class="inq">(.*?)</span>')  # one-line summary
findBd = re.compile(r'<p class="">(.*?)</p>', re.S)  # related details block

# Crawl the pages
def getData(baseUrl):
    datalist = []
    for i in range(0, 10):  # 10 pages, 25 films per page
        url = baseUrl + str(i * 25)
        html = askURL(url)  # fetch and keep the page source
        # 2. Parse the data item by item
        soup = BeautifulSoup(html, "html.parser")
        for item in soup.find_all('div', class_="item"):
            # print(item)
            data = []  # all the fields of one film
            item = str(item)
            # apply the patterns
            link = re.findall(findLink, item)[0]
            data.append(link)
            imgSrc = re.findall(findImgSrc, item)[0]
            data.append(imgSrc)
            titles = re.findall(findTitle, item)
            if len(titles) == 2:
                ctitle = titles[0]  # Chinese title
                data.append(ctitle)
                otitle = titles[1].replace("/", "")  # original (foreign) title
                data.append(otitle)
            else:
                data.append(titles[0])
                data.append(" ")  # no foreign title
            rate = re.findall(findRate, item)[0]
            data.append(rate)
            comNum = re.findall(findComNum, item)[0]
            data.append(comNum)
            inq = re.findall(findInq, item)
            if len(inq) != 0:
                inq = inq[0].replace("。", "")
            else:
                inq = " "
            data.append(inq)
            bd = re.findall(findBd, item)[0]
            bd = re.sub(r'<br(\s+)?/>(\s+)?', " ", bd)
            bd = re.sub('/', " ", bd)
            data.append(bd.strip())
            datalist.append(data)
    # for data in datalist:
    #     print(data)
    return datalist
# Save the data
def saveData(datalist, savePath):
    workbook = xlwt.Workbook(encoding="utf-8", style_compression=0)  # create the workbook (no style compression)
    sheet = workbook.add_sheet('豆瓣电影Top250', cell_overwrite_ok=True)  # allow overwriting cells
    col = ('链接', '图片链接', '标题', '外国标题', '评分', '评价人数', '概述', '影片标签')
    for i in range(0, 8):
        sheet.write(0, i, col[i])  # header row
    for i in range(0, 250):
        data = datalist[i]
        for j in range(0, 8):
            sheet.write(i + 1, j, data[j])
    workbook.save(savePath)
# Fetch the page content for a URL
def askURL(url):
    # Pretend to be a browser: tell Douban what kind of client we are and what data we accept
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:77.0) Gecko/20100101 Firefox/77.0"
    }
    request = urllib.request.Request(url, headers=headers)
    html = ""
    try:
        response = urllib.request.urlopen(request)
        html = response.read().decode("utf-8")
        # print(html)
    except urllib.error.URLError as e:
        if hasattr(e, "code"):
            print(e.code)
        if hasattr(e, "reason"):
            print(e.reason)
    return html
def main():
    baseUrl = "https://movie.douban.com/top250?start="
    # 1. Crawl the pages
    datalist = getData(baseUrl)
    mySavePath = "豆瓣电影Top250.xls"
    dbpath = 'movie.db'
    # 3. Save the data
    # saveData(datalist, mySavePath)
    save2db(datalist, dbpath)
def save2db(datalist, dbpath):
    # init_db(dbpath)
    conn = sqlite3.connect(dbpath)
    cur = conn.cursor()
    for data in datalist:
        # one row per film: wrap each value in double quotes and splice it into the SQL
        # (fragile; a parameterized sketch follows at the end)
        for index in range(len(data)):
            data[index] = '"' + data[index] + '"'
        sql = '''
            insert into movie250(
            info_link, pic_link, cname, ename, score, rate, introduction, info)
            values(%s)''' % ",".join(data)
        cur.execute(sql)
        conn.commit()
    cur.close()
    conn.close()
# Create the movie250 table (run once before save2db)
def init_db(dbpath):
    sql = '''
        create table movie250(
            id integer primary key autoincrement,
            info_link text,
            pic_link text,
            cname varchar,
            ename varchar,
            score numeric,
            rate numeric,
            introduction text,
            info text
        )
    '''
    conn = sqlite3.connect(dbpath)
    db = conn.cursor()
    db.execute(sql)
    conn.commit()
    conn.close()
if __name__ == "__main__":
    main()
    # save2db("movie.db")
    print("爬取结束!")