Skip to content

Instantly share code, notes, and snippets.

@zenuo
Created September 27, 2018 15:58
Show Gist options
  • Select an option

  • Save zenuo/55ba76a62cd33c6080fa4407f40b2817 to your computer and use it in GitHub Desktop.

Select an option

Save zenuo/55ba76a62cd33c6080fa4407f40b2817 to your computer and use it in GitHub Desktop.
一个小说爬虫
package yz.gogo.util;
import com.fasterxml.jackson.core.type.TypeReference;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.NoArgsConstructor;
import lombok.extern.slf4j.Slf4j;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import yz.gogo.core.Constants;
import java.io.File;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.nio.file.StandardOpenOption;
import java.util.Collection;
import java.util.concurrent.TimeUnit;
import java.util.stream.Collectors;
@Slf4j
public class NovelCrawler {
/**
* 目录的网址
*/
private static final String INDEX_URL = "https://www.myxs.net/3/3325/";
/**
* 最大重试次数
*/
private static final int MAX_RETRY_TIME = 100;
/**
* URL列表JSON
*/
private static final File URL_LIST_JSON_FILE = new File("urlList.json");
/**
* 输出文件
*/
private static final Path OUTPUT = Paths.get("output");
public static void main(String[] args) throws IOException {
//爬虫
new NovelCrawler().work();
}
/**
* 工作
*/
private void work() throws IOException {
//章节列表
final String[] urlList;
//若文件存在
if (URL_LIST_JSON_FILE.exists()) {
//从文件读取
urlList = Constants.MAPPER.readValue(URL_LIST_JSON_FILE, new TypeReference<String[]>() {
});
} else {
//从网页提取
urlList = this.getUrlList();
//保存至文件
Constants.MAPPER.writeValue(URL_LIST_JSON_FILE, urlList);
}
log.info("got url list");
//遍历章节列表
for (int i = 0; i < urlList.length; i++) {
//当前章节URL
String url = urlList[i];
//重试循环
for (int j = 1; j <= MAX_RETRY_TIME; j++) {
try {
//解析章节
final Chapter chapter = this.parse(url);
//日志
log.info("{} / {}", i + 1, urlList.length);
//追加到文件
Files.write(OUTPUT,
chapter.toString().getBytes(StandardCharsets.UTF_8),
StandardOpenOption.CREATE,
StandardOpenOption.APPEND,
StandardOpenOption.WRITE,
StandardOpenOption.SYNC);
//成功,终止重试循环
break;
} catch (IOException e) {
log.error("error {}", url, e);
}
//异常,等待并重试
try {
//超时秒数
int timeout = j * 2;
log.info("重试{}/{}, 等待{}s, {}", j, MAX_RETRY_TIME, timeout, url);
//等待
TimeUnit.SECONDS.sleep(timeout);
} catch (InterruptedException e) {
e.printStackTrace();
}
}
}
}
/**
* 获取URL列表
*
* @return URL列表
*/
private String[] getUrlList() throws IOException {
Document document = Jsoup.connect(INDEX_URL)
.header("User-Agent", UserAgentUtils.get())
.get();
Elements zjlist4 = document.getElementsByClass("zjlist4");
return zjlist4.stream()
.map(element -> element.getElementsByTag("li"))
.map(elements -> elements.stream()
.map(li -> li.getElementsByTag("a").first().attr("href"))
.collect(Collectors.toList()))
.flatMap(Collection::stream)
.map(url -> INDEX_URL + url)
.toArray(String[]::new);
}
/**
* 解析章节
*
* @param url 章节URL
* @return 若解析成功,则返回章节示例;否则返回null
*/
private Chapter parse(String url) throws IOException {
Document document = Jsoup.connect(url)
.header("User-Agent", UserAgentUtils.get())
.get();
Element main = document.getElementsByClass("wrapper_main").first();
String title = main.getElementById("htmltimu").text();
String content = main.getElementById("htmlContent").text();
return Chapter.builder()
.title(title)
.content(content)
.build();
}
/**
* 章节类
*/
@Builder
@NoArgsConstructor
@AllArgsConstructor
private static class Chapter {
/**
* 模板
*/
private static final String TEMPLATE = "%s\r\n\r\n%s\r\n\r\n";
/**
* 标题
*/
String title;
/**
* 内容
*/
String content;
@Override
public String toString() {
return String.format(TEMPLATE, title, content);
}
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment