Issues

pip installation: https://blog.csdn.net/qq_37176126/article/details/72824404

Basics

urllib

import urllib.request

# GET request
# response = urllib.request.urlopen("http://www.baidu.com")
# print(response.read().decode("utf-8"))

# POST request
# import urllib.parse
# data = bytes(urllib.parse.urlencode({"hello": "world"}), encoding="utf-8")
# response = urllib.request.urlopen("http://httpbin.org/post", data=data)
# print(response.read().decode("utf-8"))

# Timeout handling
# try:
#     response = urllib.request.urlopen("http://www.baidu.com", timeout=0.1)  # times out
#     print(response.read().decode("utf-8"))
# except urllib.error.URLError as e:
#     print(e, "timed out")

# response = urllib.request.urlopen("http://www.mingyuefusu.cn", timeout=0.1)  # times out
# print(response.status)  # 418: the site refuses to serve crawlers
# print(response.getheaders())
# print(response.getheader("Server"))
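# A hedged sketch (not in the original notes): with urllib's default User-Agent,
# Douban answers 418, and urlopen raises HTTPError, which carries the status code
# try:
#     urllib.request.urlopen("https://www.douban.com")
# except urllib.error.HTTPError as e:
#     print(e.code)  # 418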

# Impersonating a browser
# url = "http://httpbin.org/post"
# headers = {
#     "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:77.0) Gecko/20100101 Firefox/77.0"
# }
# data = bytes(urllib.parse.urlencode({'name': 'ming'}), encoding="utf-8")
# req = urllib.request.Request(url=url, data=data, headers=headers, method="POST")
# response = urllib.request.urlopen(req)
# print(response.read().decode("utf-8"))

# Fetching Douban
url = "https://www.douban.com"
headers = {
    "User-Agent" : "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:77.0) Gecko/20100101 Firefox/77.0"
}
req = urllib.request.Request(url=url, headers=headers)
response = urllib.request.urlopen(req)
print(response.read().decode("utf-8"))

bs4

'''
    BeautifulSoup4 turns an HTML document into a tree; every node is a Python object.
    There are four kinds of object:
    - Tag               the first matching tag and its contents
    - NavigableString   the text inside a tag, accessed via .string
    - BeautifulSoup     the document as a whole
    - Comment           a special NavigableString whose output omits the comment delimiters
'''

from bs4 import BeautifulSoup
with open("./baidu.htm", "rb") as file:  # binary mode; decode to text ourselves
    html = file.read().decode("utf-8")
bs = BeautifulSoup(html, "html.parser")
# 1. Tag
# print(bs.title)  # fetch a tag
# print(bs.a)
# print(type(bs.body))

# 2. NavigableString
# print(bs.title.string)  # only the tag's text content
# print(type(bs.title.string))

# print(bs.a.attrs)  # the tag's attributes as a dict

# 3. BeautifulSoup
# print(type(bs))
# print(bs.name)
# print(bs)  # the whole document

# 4. Comment: a special NavigableString; its output drops the <!-- --> delimiters
# print(bs.a.string)
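# A self-contained sketch of the Comment type, using an inline HTML string
# instead of baidu.htm (the variable names here are just for illustration):
from bs4 import Comment
doc = BeautifulSoup('<a><!-- a comment --></a>', "html.parser")
text = doc.a.string  # the comment's text, without the <!-- --> markers
print(text, isinstance(text, Comment))  # " a comment " True - check the type before using it as plain text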

# ------------------------

# 1. Traversing the document
# print(bs.head.contents)
print(bs.head.contents[2])  # contents is a list; index into it for a single child

# 2. Searching the document

# (1) find_all(): find all matching tags
# String filter: matches tags whose name equals the string exactly

# t_list = bs.find_all("a")
# print(t_list)

# Regular-expression filter: matched with search()
import re
# t_list = bs.find_all(re.compile("a"))
# print(t_list)

# Function filter: pass in a function; a tag is kept when the function returns True
def name_is_exists(tag):
    return tag.has_attr("name")

t_list = bs.find_all(name_is_exists)
# print(t_list)
# Iterate over the results
# for item in t_list:
#     print(item)

# (2) Keyword arguments (kwargs)
# t_list = bs.find_all(id = "head")
# t_list = bs.find_all(class_ = True)
# t_list = bs.find_all(href="http://wenku.baidu.com/search?word=&lm=0&od=0&ie=utf-8")
# for item in t_list:
#     print(item)

# (3) The text parameter
# t_list = bs.find_all(text="hao123")
# t_list = bs.find_all(text=["hao123", "地图", "贴吧", "???"])
# t_list = bs.find_all(text=re.compile(r"\d"))  # regular expression; raw string avoids escape issues

# (4) limit: cap the number of results
# t_list = bs.find_all("a", limit=3)

# (5) CSS selectors: tag "title", id "#title", class ".mnav", attribute "a[class='button']"
# select("body > div > a") matches direct children; ~ matches siblings; t_list[0].get_text() extracts the text
print(bs.select('title'))
t_list = bs.select('.mnav')
for item in t_list:
    print(item)
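
# A short sketch of the other selector forms listed above, run against an inline
# snippet (the id and classes here are made up for illustration):
snippet = BeautifulSoup(
    '<body><div id="head"><a class="mnav" href="/news">news</a>'
    '<a class="button" href="/map">map</a></div></body>',
    "html.parser")
print(snippet.select("#head"))              # by id
print(snippet.select("a[class='button']"))  # by attribute
print(snippet.select("body > div > a"))     # direct children
print(snippet.select("a.mnav ~ a"))         # siblings following a.mnav
print(snippet.select("a")[0].get_text())    # text only: "news"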

Regular expressions


import re
# Create a pattern object
pat = re.compile("AA")     # the pattern
res = pat.search("dAAAd")  # the string being searched
print(res)

# Without a pattern object
# res = re.search("AA", "DAAA")  # pattern first, then the string to search
# print(res)

# res = re.findall("AA", "AAAAAA")     # pattern first, then the string to search
# res = re.findall("[A-Z]+", "AAAAAA")
res = re.sub("A", "a", "AAAAAA")  # replace every "A" with "a"
print(res)
# Prefer raw strings for patterns; then escape characters are no worry
# a = r"\assd\'"
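
# Why raw strings matter (a small demonstration): "\b" in a normal string is a
# backspace character, so the regex engine never sees a word boundary
print(re.findall("\bcat\b", "cat"))   # [] - the pattern contains literal backspaces
print(re.findall(r"\bcat\b", "cat"))  # ['cat'] - the raw string keeps \b as a word boundary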

xlwt

import xlwt
workbook = xlwt.Workbook(encoding="utf-8")  # create a workbook object
worksheet = workbook.add_sheet('sheet1')    # create a worksheet
# Write a multiplication table: row h, column l gets "l*h=..."
for h in range(1, 10):
    for l in range(1, h + 1):
        worksheet.write(h, l, "{0}*{1}={2} ".format(l, h, h * l))
# worksheet.write(0, 0, 'hello')  # write 'hello' into cell (0, 0)
# worksheet.write(0, 1, 'Excel')  # write 'Excel' into cell (0, 1)
workbook.save('helloExcel.xls')  # save the spreadsheet
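
# To check the file without opening Excel, it can be read back with xlrd
# (a separate package, not used elsewhere in these notes; pip install xlrd):
# import xlrd
# book = xlrd.open_workbook('helloExcel.xls')
# print(book.sheet_by_index(0).cell_value(1, 1))  # "1*1=1 "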

Example

Scraping the Douban Movie Top 250

Saving to Excel

from bs4 import BeautifulSoup   # parse the pages and extract data
import re                       # regular expressions for text matching
import urllib.request, urllib.error  # build URLs and fetch page data
import xlwt                     # Excel output
import sqlite3                  # SQLite database output

findLink = re.compile(r'<a href="(.*?)">')  # pattern for the URL inside an <a> tag
findImgSrc = re.compile(r'<img.*src="(.*?)"', re.S)  # re.S lets . match newlines too
findTitle = re.compile(r'<span class="title">(.*?)</span>')  # movie title
findRate = re.compile(r'<span class="rating_num" property="v:average">(.*?)</span>')  # rating
findComNum = re.compile(r'<span>(\d*?)人评价</span>')  # number of ratings (matches the Chinese page text)
findInq = re.compile(r'<span class="inq">(.*?)</span>')  # one-line summary
findBd = re.compile(r'<p class="">(.*?)</p>', re.S)  # movie details
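
# To see what the patterns extract, a quick sketch against a hand-written
# stand-in for one Top250 entry (not real page source):
# sample = '<div class="item"><a href="https://movie.douban.com/subject/1/">' \
#          '<span class="title">标题</span><span>100人评价</span></a></div>'
# print(re.findall(findLink, sample))    # ['https://movie.douban.com/subject/1/']
# print(re.findall(findTitle, sample))   # ['标题']
# print(re.findall(findComNum, sample))  # ['100']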

# Crawl the pages
def getData(baseUrl):
    datalist = []
    for i in range(0, 10):          # 10 pages, 25 movies per page
        url = baseUrl + str(i * 25)
        html = askURL(url)          # fetch the page source
        # 2. Parse each movie entry
        soup = BeautifulSoup(html, "html.parser")
        for item in soup.find_all('div', class_="item"):
            # print(item)
            data = []  # everything about one movie
            item = str(item)
            # Apply the compiled patterns
            link = re.findall(findLink, item)[0]
            data.append(link)

            imgSrc = re.findall(findImgSrc, item)[0]
            data.append(imgSrc)

            titles = re.findall(findTitle, item)
            if len(titles) == 2:
                ctitle = titles[0]                   # Chinese title
                data.append(ctitle)
                otitle = titles[1].replace("/", "")  # foreign title
                data.append(otitle)
            else:
                data.append(titles[0])
                data.append(" ")  # placeholder when there is no foreign title

            rate = re.findall(findRate, item)[0]
            data.append(rate)

            comNum = re.findall(findComNum, item)[0]
            data.append(comNum)

            inq = re.findall(findInq, item)
            if len(inq) != 0:
                inq = inq[0].replace("。", "")  # drop the trailing full stop
            else:
                inq = " "
            data.append(inq)

            bd = re.findall(findBd, item)[0]
            bd = re.sub(r'<br(\s+)?/>(\s+)?', " ", bd)  # strip <br/> tags (raw string for the pattern)
            bd = re.sub('/', " ", bd)
            data.append(bd.strip())
            datalist.append(data)
    # for data in datalist:
    #     print(data)
    return datalist

# Save the data to Excel
def saveData(datalist, savePath):
    workbook = xlwt.Workbook(encoding="utf-8", style_compression=0)  # create the workbook
    sheet = workbook.add_sheet('豆瓣电影Top250', cell_overwrite_ok=True)  # allow overwriting cells
    col = ('链接', '图片链接', '标题', '外国标题', '评分', '评价人数', '概述', '影片标签')

    for i in range(0, 8):
        sheet.write(0, i, col[i])  # header row
    for i in range(len(datalist)):  # one row per scraped movie (250 when all pages load)
        data = datalist[i]
        for j in range(0, 8):
            sheet.write(i + 1, j, data[j])
    workbook.save(savePath)

# Fetch the page content for one URL
def askURL(url):
    # Disguise ourselves: tell Douban what kind of client we are and what data we accept
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:77.0) Gecko/20100101 Firefox/77.0"
    }
    request = urllib.request.Request(url, headers=headers)
    html = ""
    try:
        response = urllib.request.urlopen(request)
        html = response.read().decode("utf-8")
        #print(html)
    except urllib.error.URLError as e:
        if hasattr(e, "code"):
            print(e.code)
        if hasattr(e, "reason"):
            print(e.reason)
    return html
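
# Optional tweak (not in the original notes): Douban throttles aggressive clients,
# so pausing between page requests lowers the chance of being blocked
# import time
# def askURL_politely(url, delay=1.0):
#     time.sleep(delay)  # wait before each request
#     return askURL(url)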


def main():
    baseUrl = "https://movie.douban.com/top250?start="
    # 1. Crawl the pages (step 2, parsing, happens inside getData)
    datalist = getData(baseUrl)
    mySavePath = "豆瓣电影Top250.xls"
    # 3. Save the data
    saveData(datalist, mySavePath)

if __name__ == "__main__":
    main()
    print("Done scraping!")

Saving to SQLite

The imports, the compiled patterns, and the getData, saveData, and askURL functions are identical to the Excel version above; only main and the two database functions below change.


def main():
    baseUrl = "https://movie.douban.com/top250?start="
    # 1. Crawl the pages
    datalist = getData(baseUrl)
    mySavePath = "豆瓣电影Top250.xls"
    dbpath = 'movie.db'
    # 3. Save the data - to SQLite this time; the Excel call is kept for reference
    # saveData(datalist, mySavePath)
    save2db(datalist, dbpath)

# Save the data into SQLite
def save2db(datalist, dbpath):
    init_db(dbpath)  # make sure the table exists
    conn = sqlite3.connect(dbpath)
    cur = conn.cursor()
    sql = '''
        insert into movie250(
            info_link, pic_link, cname, ename, score, rate, introduction, info)
            values(?, ?, ?, ?, ?, ?, ?, ?)'''
    for data in datalist:
        # Parameterized query: sqlite3 handles the quoting, so titles
        # containing quote characters no longer break the SQL
        cur.execute(sql, data)
    conn.commit()
    cur.close()
    conn.close()

def init_db(dbpath):
    # "if not exists" lets save2db call this unconditionally
    sql = '''
        create table if not exists movie250(
            id integer primary key autoincrement,
            info_link text,
            pic_link text,
            cname varchar,
            ename varchar,
            score numeric,
            rate numeric,
            introduction text,
            info text
        )
    '''
    conn = sqlite3.connect(dbpath)
    db = conn.cursor()
    db.execute(sql)
    conn.commit()
    conn.close()
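
# A quick sanity check (a sketch, not part of the original notes): read a few rows back
def peek_db(dbpath, limit=3):
    conn = sqlite3.connect(dbpath)
    cur = conn.cursor()
    # A parameterized LIMIT works like any other placeholder in sqlite3
    for row in cur.execute("select id, cname, score from movie250 limit ?", (limit,)):
        print(row)  # (id, cname, score) tuples
    conn.close()
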
if __name__ == "__main__":
    main()
    print("Done scraping!")