Jsoup

开始



    <dependency>
        <groupId>org.jsoup</groupId>
        <artifactId>jsoup</artifactId>
        <version>1.14.3</version>
    </dependency>

https://jsoup.org/

代理ip池:https://github.com/jhao104/proxy_pool

package com.ming.library.bookspider;

import com.ming.library.bookspider.service.BookSortService;
import com.ming.library.bookspider.service.BooksService;
import com.ming.library.bookspider.vo.DouBanBooks;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.junit.jupiter.api.Test;
import org.junit.platform.commons.util.StringUtils;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.boot.test.context.SpringBootTest;

import java.io.IOException;
import java.net.InetSocketAddress;
import java.net.Proxy;
import java.util.LinkedList;
import java.util.List;
import java.util.Random;

@SpringBootTest
class BookSpiderApplicationTests {

    @Autowired
    private BookSortService bookSortService;

    @Autowired
    private BooksService booksService;


    @Test
    void test() throws IOException {
        for(int i = 1; i < 20; i++) &#123;
            try &#123;
                downloadPageBook(i);
            &#125; catch (Exception e) &#123;
                e.printStackTrace();
            &#125;
        &#125;
    &#125;

    public void downloadPageBook(long pageNum) throws IOException, InterruptedException &#123;
        Document pageDocument = Jsoup.connect("https://book.douban.com/latest?subcat=%E5%85%A8%E9%83%A8&p=" +pageNum)
//        Document pageDocument = Jsoup.connect("http://www.mingyuefusu.cn")
//                .proxy(
//                        "117.157.197.18", 8085
//                )
//                .proxy(
//                        new Proxy(
//                            Proxy.Type.HTTP,
//                            new InetSocketAddress("113.118.159.55", 9000 )
//                        )
//                )
                .userAgent(getRandomAgent())
                .header("referer","https://www.baidu.com")
                .get();
        System.out.println(pageDocument.title());
        Elements bookDatas = pageDocument.select(".media__body");
        List<DouBanBooks> douBanBooksList = new LinkedList<DouBanBooks>();
        for (Element bookData : bookDatas) &#123;
            DouBanBooks douBanBook = initDouBanBook();
            Element bookNameElement = bookData.select("a.fleft").first();
            String bookUrl = null;
            if(bookNameElement != null) &#123;
                String bookName = bookNameElement.text();
                bookUrl = bookNameElement.attr("abs:href");
                douBanBook.setName(bookName);
            &#125;
            Element bookPublishDataElement = bookData.select("p.subject-abstract").first();
            if(bookPublishDataElement != null ) &#123;
                String authorName = bookPublishDataElement.text().split("/")[0].trim();
                douBanBook.setAuthor(authorName);
            &#125;
            // 详细信息
            if(bookUrl != null) &#123;
                // https://book.douban.com/subject/35600427/
                Thread.sleep(3000);
                Document bookDetailDocument = Jsoup.connect(bookUrl).get();
                // 简介
                Element introElement = bookDetailDocument.select(".intro").first();
                if(introElement != null) &#123;
                    Element allHrefElement = introElement.select(".a_show_full").last();
                    if(allHrefElement != null) &#123;
                        allHrefElement.remove();
                        Element emptyMoreElement = introElement.select("p").last();
                        if( emptyMoreElement != null && StringUtils.isBlank(emptyMoreElement.text())) &#123;
                            emptyMoreElement.remove();
                        &#125;
                    &#125;
                    douBanBook.setDescription(introElement.html());
                &#125;
                // 分类
                Element oneTagElement = bookDetailDocument.select("div.indent > span > .tag").first();
                if(oneTagElement != null) &#123;
                    douBanBook.setSortName(oneTagElement.text());
                &#125;
            &#125;
            System.out.println(douBanBook);
            douBanBooksList.add(douBanBook);
        &#125;
        for (DouBanBooks douBanBook : douBanBooksList) &#123;
            booksService.saveBookAndAddSort(douBanBook);
        &#125;
        Thread.sleep(3000);
    &#125;

    private String getRandomAgent() &#123;
        String[] ua = &#123;"Mozilla/5.0 (Windows NT 6.1; WOW64; rv:46.0) Gecko/20100101 Firefox/46.0",
                "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.87 Safari/537.36 OPR/37.0.2178.32",
                "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.57.2 (KHTML, like Gecko) Version/5.1.7 Safari/534.57.2",
                "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36",
                "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2486.0 Safari/537.36 Edge/13.10586",
                "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko",
                "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0)",
                "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0)",
                "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; WOW64; Trident/4.0)",
                "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.106 BIDUBrowser/8.3 Safari/537.36",
                "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.80 Safari/537.36 Core/1.47.277.400 QQBrowser/9.4.7658.400",
                "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.116 UBrowser/5.6.12150.8 Safari/537.36",
                "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 Safari/537.36 SE 2.X MetaSr 1.0",
                "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.116 Safari/537.36 TheWorld 7",
                "Mozilla/5.0 (Windows NT 6.1; W…) Gecko/20100101 Firefox/60.0"&#125;;
        return ua[ new Random().nextInt(ua.length) ];
    &#125;

    public DouBanBooks initDouBanBook() &#123;
        DouBanBooks douBanBook = new DouBanBooks();
        douBanBook.setName("书本");
        douBanBook.setAuthor("匿名");
        douBanBook.setSortName("未命名");
        douBanBook.setDescription("无简介");
        return douBanBook;
    &#125;



&#125;