zenuo · September 27, 2018 15:58
diff --git a/NovelCrawler.java b/NovelCrawler.java
 package yz.gogo.util;

 import com.fasterxml.jackson.core.type.TypeReference;
 import lombok.AllArgsConstructor;
 import lombok.Builder;
 import lombok.NoArgsConstructor;
 import lombok.extern.slf4j.Slf4j;
 import org.jsoup.Jsoup;
 import org.jsoup.nodes.Document;
 import org.jsoup.nodes.Element;
 import org.jsoup.select.Elements;
 import yz.gogo.core.Constants;

 import java.io.File;
 import java.io.IOException;
 import java.nio.charset.StandardCharsets;
 import java.nio.file.Files;
 import java.nio.file.Path;
 import java.nio.file.Paths;
 import java.nio.file.StandardOpenOption;
 import java.util.Collection;
 import java.util.concurrent.TimeUnit;
 import java.util.stream.Collectors;

 @Slf4j
 public class NovelCrawler {
    /**
     * 目录的网址
     */
    private static final String INDEX_URL = "https://www.myxs.net/3/3325/";

    /**
     * 最大重试次数
     */
    private static final int MAX_RETRY_TIME = 100;

    /**
     * URL列表JSON
     */
    private static final File URL_LIST_JSON_FILE = new File("urlList.json");

    /**
     * 输出文件
     */
    private static final Path OUTPUT = Paths.get("output");


    public static void main(String[] args) throws IOException {
        //爬虫
        new NovelCrawler().work();
    }

    /**
     * 工作
     */
    private void work() throws IOException {
        //章节列表
        final String[] urlList;
        //若文件存在
        if (URL_LIST_JSON_FILE.exists()) {
            //从文件读取
            urlList = Constants.MAPPER.readValue(URL_LIST_JSON_FILE, new TypeReference<String[]>() {
            });
        } else {
            //从网页提取
            urlList = this.getUrlList();
            //保存至文件
            Constants.MAPPER.writeValue(URL_LIST_JSON_FILE, urlList);
        }
        log.info("got url list");
        //遍历章节列表
        for (int i = 0; i < urlList.length; i++) {
            //当前章节URL
            String url = urlList[i];
            //重试循环
            for (int j = 1; j <= MAX_RETRY_TIME; j++) {
                try {
                    //解析章节
                    final Chapter chapter = this.parse(url);
                    //日志
                    log.info("{} / {}", i + 1, urlList.length);
                    //追加到文件
                    Files.write(OUTPUT,
                            chapter.toString().getBytes(StandardCharsets.UTF_8),
                            StandardOpenOption.CREATE,
                            StandardOpenOption.APPEND,
                            StandardOpenOption.WRITE,
                            StandardOpenOption.SYNC);
                    //成功，终止重试循环
                    break;
                } catch (IOException e) {
                    log.error("error {}", url, e);
                }
                //异常，等待并重试
                try {
                    //超时秒数
                    int timeout = j * 2;
                    log.info("重试{}/{}, 等待{}s, {}", j, MAX_RETRY_TIME, timeout, url);
                    //等待
                    TimeUnit.SECONDS.sleep(timeout);
                } catch (InterruptedException e) {
                    e.printStackTrace();
                }
            }
        }
    }

    /**
     * 获取URL列表
     *
     * @return URL列表
     */
    private String[] getUrlList() throws IOException {
        Document document = Jsoup.connect(INDEX_URL)
                .header("User-Agent", UserAgentUtils.get())
                .get();
        Elements zjlist4 = document.getElementsByClass("zjlist4");
        return zjlist4.stream()
                .map(element -> element.getElementsByTag("li"))
                .map(elements -> elements.stream()
                        .map(li -> li.getElementsByTag("a").first().attr("href"))
                        .collect(Collectors.toList()))
                .flatMap(Collection::stream)
                .map(url -> INDEX_URL + url)
                .toArray(String[]::new);
    }

    /**
     * 解析章节
     *
     * @param url 章节URL
     * @return 若解析成功，则返回章节示例；否则返回null
     */
    private Chapter parse(String url) throws IOException {
        Document document = Jsoup.connect(url)
                .header("User-Agent", UserAgentUtils.get())
                .get();
        Element main = document.getElementsByClass("wrapper_main").first();
        String title = main.getElementById("htmltimu").text();
        String content = main.getElementById("htmlContent").text();
        return Chapter.builder()
                .title(title)
                .content(content)
                .build();
    }

    /**
     * 章节类
     */
    @Builder
    @NoArgsConstructor
    @AllArgsConstructor
    private static class Chapter {

        /**
         * 模板
         */
        private static final String TEMPLATE = "%s\r\n\r\n%s\r\n\r\n";

        /**
         * 标题
         */
        String title;

        /**
         * 内容
         */
        String content;

        @Override
        public String toString() {
            return String.format(TEMPLATE, title, content);
        }
    }
 }
	package yz.gogo.util;

	import com.fasterxml.jackson.core.type.TypeReference;
	import lombok.AllArgsConstructor;
	import lombok.Builder;
	import lombok.NoArgsConstructor;
	import lombok.extern.slf4j.Slf4j;
	import org.jsoup.Jsoup;
	import org.jsoup.nodes.Document;
	import org.jsoup.nodes.Element;
	import org.jsoup.select.Elements;
	import yz.gogo.core.Constants;

	import java.io.File;
	import java.io.IOException;
	import java.nio.charset.StandardCharsets;
	import java.nio.file.Files;
	import java.nio.file.Path;
	import java.nio.file.Paths;
	import java.nio.file.StandardOpenOption;
	import java.util.Collection;
	import java.util.concurrent.TimeUnit;
	import java.util.stream.Collectors;

	@Slf4j
	public class NovelCrawler {
	/**
	* 目录的网址
	*/
	private static final String INDEX_URL = "https://www.myxs.net/3/3325/";

	/**
	* 最大重试次数
	*/
	private static final int MAX_RETRY_TIME = 100;

	/**
	* URL列表JSON
	*/
	private static final File URL_LIST_JSON_FILE = new File("urlList.json");

	/**
	* 输出文件
	*/
	private static final Path OUTPUT = Paths.get("output");


	public static void main(String[] args) throws IOException {
	//爬虫
	new NovelCrawler().work();
	}

	/**
	* 工作
	*/
	private void work() throws IOException {
	//章节列表
	final String[] urlList;
	//若文件存在
	if (URL_LIST_JSON_FILE.exists()) {
	//从文件读取
	urlList = Constants.MAPPER.readValue(URL_LIST_JSON_FILE, new TypeReference<String[]>() {
	});
	} else {
	//从网页提取
	urlList = this.getUrlList();
	//保存至文件
	Constants.MAPPER.writeValue(URL_LIST_JSON_FILE, urlList);
	}
	log.info("got url list");
	//遍历章节列表
	for (int i = 0; i < urlList.length; i++) {
	//当前章节URL
	String url = urlList[i];
	//重试循环
	for (int j = 1; j <= MAX_RETRY_TIME; j++) {
	try {
	//解析章节
	final Chapter chapter = this.parse(url);
	//日志
	log.info("{} / {}", i + 1, urlList.length);
	//追加到文件
	Files.write(OUTPUT,
	chapter.toString().getBytes(StandardCharsets.UTF_8),
	StandardOpenOption.CREATE,
	StandardOpenOption.APPEND,
	StandardOpenOption.WRITE,
	StandardOpenOption.SYNC);
	//成功，终止重试循环
	break;
	} catch (IOException e) {
	log.error("error {}", url, e);
	}
	//异常，等待并重试
	try {
	//超时秒数
	int timeout = j * 2;
	log.info("重试{}/{}, 等待{}s, {}", j, MAX_RETRY_TIME, timeout, url);
	//等待
	TimeUnit.SECONDS.sleep(timeout);
	} catch (InterruptedException e) {
	e.printStackTrace();
	}
	}
	}
	}

	/**
	* 获取URL列表
	*
	* @return URL列表
	*/
	private String[] getUrlList() throws IOException {
	Document document = Jsoup.connect(INDEX_URL)
	.header("User-Agent", UserAgentUtils.get())
	.get();
	Elements zjlist4 = document.getElementsByClass("zjlist4");
	return zjlist4.stream()
	.map(element -> element.getElementsByTag("li"))
	.map(elements -> elements.stream()
	.map(li -> li.getElementsByTag("a").first().attr("href"))
	.collect(Collectors.toList()))
	.flatMap(Collection::stream)
	.map(url -> INDEX_URL + url)
	.toArray(String[]::new);
	}

	/**
	* 解析章节
	*
	* @param url 章节URL
	* @return 若解析成功，则返回章节示例；否则返回null
	*/
	private Chapter parse(String url) throws IOException {
	Document document = Jsoup.connect(url)
	.header("User-Agent", UserAgentUtils.get())
	.get();
	Element main = document.getElementsByClass("wrapper_main").first();
	String title = main.getElementById("htmltimu").text();
	String content = main.getElementById("htmlContent").text();
	return Chapter.builder()
	.title(title)
	.content(content)
	.build();
	}

	/**
	* 章节类
	*/
	@Builder
	@NoArgsConstructor
	@AllArgsConstructor
	private static class Chapter {

	/**
	* 模板
	*/
	private static final String TEMPLATE = "%s\r\n\r\n%s\r\n\r\n";

	/**
	* 标题
	*/
	String title;

	/**
	* 内容
	*/
	String content;

	@Override
	public String toString() {
	return String.format(TEMPLATE, title, content);
	}
	}
	}
No results found