Jsoup
开始
<dependency>
    <groupId>org.jsoup</groupId>
    <artifactId>jsoup</artifactId>
    <version>1.14.3</version>
</dependency>
代理 IP 池:https://github.com/jhao104/proxy_pool
package com.ming.library.bookspider;
import com.ming.library.bookspider.service.BookSortService;
import com.ming.library.bookspider.service.BooksService;
import com.ming.library.bookspider.vo.DouBanBooks;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.junit.jupiter.api.Test;
import org.junit.platform.commons.util.StringUtils;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.boot.test.context.SpringBootTest;
import java.io.IOException;
import java.net.InetSocketAddress;
import java.net.Proxy;
import java.util.LinkedList;
import java.util.List;
import java.util.Random;
/**
 * Spring Boot test harness that scrapes the Douban "latest books" listing
 * with jsoup and stores every scraped book through {@link BooksService}.
 *
 * <p>For each listing entry the spider reads the title and author from the
 * listing page, then opens the book's detail page to read its description and
 * first tag. Requests are spaced out by {@link #REQUEST_DELAY_MILLIS}.
 */
@SpringBootTest
class BookSpiderApplicationTests {

    /** Listing URL; the page number is appended to the trailing {@code p=} parameter. */
    private static final String LATEST_BOOKS_URL =
            "https://book.douban.com/latest?subcat=%E5%85%A8%E9%83%A8&p=";

    /** Referer header sent with every request. */
    private static final String REFERER = "https://www.baidu.com";

    /** Pause, in milliseconds, before each detail request and after each listing page. */
    private static final long REQUEST_DELAY_MILLIS = 3000;

    /** Shared generator used to pick a user agent; avoids allocating one per request. */
    private static final Random RANDOM = new Random();

    /** Pool of browser user-agent strings; one is picked at random per request. */
    private static final String[] USER_AGENTS = {
        "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:46.0) Gecko/20100101 Firefox/46.0",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.87 Safari/537.36 OPR/37.0.2178.32",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.57.2 (KHTML, like Gecko) Version/5.1.7 Safari/534.57.2",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2486.0 Safari/537.36 Edge/13.10586",
        "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko",
        "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0)",
        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0)",
        "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; WOW64; Trident/4.0)",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.106 BIDUBrowser/8.3 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.80 Safari/537.36 Core/1.47.277.400 QQBrowser/9.4.7658.400",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.116 UBrowser/5.6.12150.8 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 Safari/537.36 SE 2.X MetaSr 1.0",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.116 Safari/537.36 TheWorld 7",
        // The original entry was a truncated copy ("W…") containing a non-ASCII
        // ellipsis; restored to the full Firefox 60 user-agent string.
        "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:60.0) Gecko/20100101 Firefox/60.0"
    };

    // Injected but not referenced anywhere in this class.
    @Autowired
    private BookSortService bookSortService;

    @Autowired
    private BooksService booksService;

    /**
     * Scrapes listing pages 1 through 19. A failure on one page is reported and
     * the next page is attempted; an interrupt stops the run.
     */
    @Test
    void test() throws IOException {
        for (int page = 1; page < 20; page++) {
            try {
                downloadPageBook(page);
            } catch (InterruptedException e) {
                // Restore the flag and stop instead of carrying on through the remaining pages.
                Thread.currentThread().interrupt();
                return;
            } catch (Exception e) {
                // Best effort: one bad page must not stop the remaining pages.
                System.err.println("Failed to scrape listing page " + page);
                e.printStackTrace();
            }
        }
    }

    /**
     * Scrapes one listing page and saves each book found on it.
     *
     * <p>Each book is saved as soon as it has been scraped, so a failure on a
     * later book does not discard the books already processed on this page.
     *
     * @param pageNum page number appended to the listing URL
     * @throws IOException if the listing page or a book detail page cannot be fetched
     * @throws InterruptedException if the thread is interrupted during a pause between requests
     */
    public void downloadPageBook(long pageNum) throws IOException, InterruptedException {
        Document pageDocument = fetch(LATEST_BOOKS_URL + pageNum);
        System.out.println(pageDocument.title());

        for (Element bookData : pageDocument.select(".media__body")) {
            DouBanBooks douBanBook = initDouBanBook();
            String bookUrl = readListingEntry(bookData, douBanBook);

            // "abs:href" yields an empty string when the link is missing or cannot be
            // resolved; connecting to an empty URL would throw.
            if (!bookUrl.isEmpty()) {
                // e.g. https://book.douban.com/subject/35600427/
                Thread.sleep(REQUEST_DELAY_MILLIS);
                readDetailPage(fetch(bookUrl), douBanBook);
            }

            System.out.println(douBanBook);
            booksService.saveBookAndAddSort(douBanBook);
        }
        Thread.sleep(REQUEST_DELAY_MILLIS);
    }

    /**
     * Fetches a page with a random user agent and the shared referer header.
     *
     * <p>To route requests through a proxy, add {@code .proxy(host, port)} or
     * {@code .proxy(new Proxy(Proxy.Type.HTTP, new InetSocketAddress(host, port)))}
     * to the chain before {@code .get()}.
     *
     * @param url absolute URL to request
     * @return the parsed document
     * @throws IOException if the request fails
     */
    private Document fetch(String url) throws IOException {
        return Jsoup.connect(url)
                .userAgent(getRandomAgent())
                .header("referer", REFERER)
                .get();
    }

    /**
     * Copies the title and author from a listing entry into the book.
     *
     * @param bookData listing element for one book
     * @param douBanBook book to populate; placeholder values are kept for missing fields
     * @return the absolute URL of the book's detail page, or an empty string if none was found
     */
    private String readListingEntry(Element bookData, DouBanBooks douBanBook) {
        String bookUrl = "";
        Element bookNameElement = bookData.select("a.fleft").first();
        if (bookNameElement != null) {
            douBanBook.setName(bookNameElement.text());
            bookUrl = bookNameElement.attr("abs:href");
        }

        Element bookPublishDataElement = bookData.select("p.subject-abstract").first();
        if (bookPublishDataElement != null) {
            String authorName = firstSlashSegment(bookPublishDataElement.text());
            // Keep the placeholder author rather than overwriting it with a blank value.
            if (!authorName.isEmpty()) {
                douBanBook.setAuthor(authorName);
            }
        }
        return bookUrl;
    }

    /**
     * Returns the trimmed text before the first {@code '/'}, or the whole trimmed
     * text when there is no slash. Unlike {@code split("/")[0]}, this never fails
     * on input that consists only of slashes.
     */
    private static String firstSlashSegment(String text) {
        int slash = text.indexOf('/');
        String segment = slash < 0 ? text : text.substring(0, slash);
        return segment.trim();
    }

    /**
     * Copies the description and the first tag from a detail page into the book.
     *
     * @param bookDetailDocument parsed detail page
     * @param douBanBook book to populate; placeholder values are kept for missing fields
     */
    private void readDetailPage(Document bookDetailDocument, DouBanBooks douBanBook) {
        // Description
        Element introElement = bookDetailDocument.select(".intro").first();
        if (introElement != null) {
            // Presumably the "show full text" link; it is stripped from the stored HTML.
            Element allHrefElement = introElement.select(".a_show_full").last();
            if (allHrefElement != null) {
                allHrefElement.remove();
                // Removing the link can leave an empty trailing paragraph; drop it too.
                Element emptyMoreElement = introElement.select("p").last();
                // Plain String check instead of JUnit's internal StringUtils helper.
                if (emptyMoreElement != null && emptyMoreElement.text().trim().isEmpty()) {
                    emptyMoreElement.remove();
                }
            }
            douBanBook.setDescription(introElement.html());
        }

        // Category: the first tag on the page is used as the sort name.
        Element oneTagElement = bookDetailDocument.select("div.indent > span > .tag").first();
        if (oneTagElement != null) {
            douBanBook.setSortName(oneTagElement.text());
        }
    }

    /** Returns a user-agent string picked uniformly at random from {@link #USER_AGENTS}. */
    private String getRandomAgent() {
        return USER_AGENTS[RANDOM.nextInt(USER_AGENTS.length)];
    }

    /**
     * Creates a book pre-filled with placeholder values, so that fields the
     * scraper cannot find are still populated.
     *
     * @return a new book with placeholder name, author, sort name and description
     */
    public DouBanBooks initDouBanBook() {
        DouBanBooks douBanBook = new DouBanBooks();
        douBanBook.setName("书本");
        douBanBook.setAuthor("匿名");
        douBanBook.setSortName("未命名");
        douBanBook.setDescription("无简介");
        return douBanBook;
    }
}