Created
September 27, 2018 15:58
-
-
Save zenuo/55ba76a62cd33c6080fa4407f40b2817 to your computer and use it in GitHub Desktop.
一个小说爬虫
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| package yz.gogo.util; | |
| import com.fasterxml.jackson.core.type.TypeReference; | |
| import lombok.AllArgsConstructor; | |
| import lombok.Builder; | |
| import lombok.NoArgsConstructor; | |
| import lombok.extern.slf4j.Slf4j; | |
| import org.jsoup.Jsoup; | |
| import org.jsoup.nodes.Document; | |
| import org.jsoup.nodes.Element; | |
| import org.jsoup.select.Elements; | |
| import yz.gogo.core.Constants; | |
| import java.io.File; | |
| import java.io.IOException; | |
| import java.nio.charset.StandardCharsets; | |
| import java.nio.file.Files; | |
| import java.nio.file.Path; | |
| import java.nio.file.Paths; | |
| import java.nio.file.StandardOpenOption; | |
| import java.util.Collection; | |
| import java.util.concurrent.TimeUnit; | |
| import java.util.stream.Collectors; | |
| @Slf4j | |
| public class NovelCrawler { | |
| /** | |
| * 目录的网址 | |
| */ | |
| private static final String INDEX_URL = "https://www.myxs.net/3/3325/"; | |
| /** | |
| * 最大重试次数 | |
| */ | |
| private static final int MAX_RETRY_TIME = 100; | |
| /** | |
| * URL列表JSON | |
| */ | |
| private static final File URL_LIST_JSON_FILE = new File("urlList.json"); | |
| /** | |
| * 输出文件 | |
| */ | |
| private static final Path OUTPUT = Paths.get("output"); | |
| public static void main(String[] args) throws IOException { | |
| //爬虫 | |
| new NovelCrawler().work(); | |
| } | |
| /** | |
| * 工作 | |
| */ | |
| private void work() throws IOException { | |
| //章节列表 | |
| final String[] urlList; | |
| //若文件存在 | |
| if (URL_LIST_JSON_FILE.exists()) { | |
| //从文件读取 | |
| urlList = Constants.MAPPER.readValue(URL_LIST_JSON_FILE, new TypeReference<String[]>() { | |
| }); | |
| } else { | |
| //从网页提取 | |
| urlList = this.getUrlList(); | |
| //保存至文件 | |
| Constants.MAPPER.writeValue(URL_LIST_JSON_FILE, urlList); | |
| } | |
| log.info("got url list"); | |
| //遍历章节列表 | |
| for (int i = 0; i < urlList.length; i++) { | |
| //当前章节URL | |
| String url = urlList[i]; | |
| //重试循环 | |
| for (int j = 1; j <= MAX_RETRY_TIME; j++) { | |
| try { | |
| //解析章节 | |
| final Chapter chapter = this.parse(url); | |
| //日志 | |
| log.info("{} / {}", i + 1, urlList.length); | |
| //追加到文件 | |
| Files.write(OUTPUT, | |
| chapter.toString().getBytes(StandardCharsets.UTF_8), | |
| StandardOpenOption.CREATE, | |
| StandardOpenOption.APPEND, | |
| StandardOpenOption.WRITE, | |
| StandardOpenOption.SYNC); | |
| //成功,终止重试循环 | |
| break; | |
| } catch (IOException e) { | |
| log.error("error {}", url, e); | |
| } | |
| //异常,等待并重试 | |
| try { | |
| //超时秒数 | |
| int timeout = j * 2; | |
| log.info("重试{}/{}, 等待{}s, {}", j, MAX_RETRY_TIME, timeout, url); | |
| //等待 | |
| TimeUnit.SECONDS.sleep(timeout); | |
| } catch (InterruptedException e) { | |
| e.printStackTrace(); | |
| } | |
| } | |
| } | |
| } | |
| /** | |
| * 获取URL列表 | |
| * | |
| * @return URL列表 | |
| */ | |
| private String[] getUrlList() throws IOException { | |
| Document document = Jsoup.connect(INDEX_URL) | |
| .header("User-Agent", UserAgentUtils.get()) | |
| .get(); | |
| Elements zjlist4 = document.getElementsByClass("zjlist4"); | |
| return zjlist4.stream() | |
| .map(element -> element.getElementsByTag("li")) | |
| .map(elements -> elements.stream() | |
| .map(li -> li.getElementsByTag("a").first().attr("href")) | |
| .collect(Collectors.toList())) | |
| .flatMap(Collection::stream) | |
| .map(url -> INDEX_URL + url) | |
| .toArray(String[]::new); | |
| } | |
| /** | |
| * 解析章节 | |
| * | |
| * @param url 章节URL | |
| * @return 若解析成功,则返回章节示例;否则返回null | |
| */ | |
| private Chapter parse(String url) throws IOException { | |
| Document document = Jsoup.connect(url) | |
| .header("User-Agent", UserAgentUtils.get()) | |
| .get(); | |
| Element main = document.getElementsByClass("wrapper_main").first(); | |
| String title = main.getElementById("htmltimu").text(); | |
| String content = main.getElementById("htmlContent").text(); | |
| return Chapter.builder() | |
| .title(title) | |
| .content(content) | |
| .build(); | |
| } | |
| /** | |
| * 章节类 | |
| */ | |
| @Builder | |
| @NoArgsConstructor | |
| @AllArgsConstructor | |
| private static class Chapter { | |
| /** | |
| * 模板 | |
| */ | |
| private static final String TEMPLATE = "%s\r\n\r\n%s\r\n\r\n"; | |
| /** | |
| * 标题 | |
| */ | |
| String title; | |
| /** | |
| * 内容 | |
| */ | |
| String content; | |
| @Override | |
| public String toString() { | |
| return String.format(TEMPLATE, title, content); | |
| } | |
| } | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment