java爬取网页源码,并下载网页文件到指定目录。

小熊博客
2021-05-20 / 0 评论 / 291 阅读
温馨提示:
本文最后更新于2021-05-20,若内容或图片失效,请留言反馈。

引入java的HTML解析器jsoup。

jar引用

jar下载地址

maven引用

<!-- https://mvnrepository.com/artifact/org.jsoup/jsoup -->
    <dependency>
       <groupId>org.jsoup</groupId>
       <artifactId>jsoup</artifactId>
       <version>1.13.1</version>
    </dependency>

以爬取我个人网站首页图片为例

我们先打开浏览器调试模式查看需要爬取的图片,很明显图片地址放在div标签data-original属性内

image

java代码

    public static void main(String[] args) {
        String html = getHtmlFromUrl("https://www.xtyu.top/","utf-8");//下载html源码
        Document doc = Jsoup.parse(html);
        Elements elements = doc.select("div[data-original]");//找到对应的所有标签 ps:工具网页标签类型不同灵活配置
        //遍历出所有data-original
        for(Element e : elements){
            String url =e.attr("data-original");
            System.out.println(url);
            //如果资源是相对路径则拼接一下url
            if (url.indexOf("http")==-1){
                url = url.replace("../","https://www.xtyu.top/");
            }
            //下载到指定目录
            download(url,"/Users/smile/Desktop/测试爬取/");
        }
    }


    //根据网址返回网页源代码
    public static String getHtmlFromUrl(String url,String encoding){
        StringBuffer html = new StringBuffer();
        InputStreamReader isr=null;
        BufferedReader buf=null;
        String str = null;
        try {
            URL urlObj = new URL(url);
            URLConnection con = urlObj.openConnection();
            isr = new InputStreamReader(con.getInputStream(),encoding);
            buf = new BufferedReader(isr);
            while((str=buf.readLine()) != null){
                html.append(str+"\n");
            }
        } catch (Exception e) {
            e.printStackTrace();
        }finally{
            if(isr != null){
                try {
                    buf.close();
                    isr.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
        return html.toString();
    }

    //下载文件到指定目录
    public static void download(String url,String path){
        File file= null;
        FileOutputStream fos=null;
        String downloadName= url.substring(url.lastIndexOf("/")+1);
        HttpURLConnection httpCon = null;
        URLConnection  con = null;
        URL urlObj=null;
        InputStream in =null;
        byte[] size = new byte[1024];
        int num=0;
        try {
            file = new File(path+downloadName);
            fos = new FileOutputStream(file);
            if(url.startsWith("http")){
                urlObj = new URL(url);
                con = urlObj.openConnection();
                httpCon =(HttpURLConnection) con;
                in = httpCon.getInputStream();
                while((num=in.read(size)) != -1){
                    for(int i=0;i<num;i++)
                        fos.write(size[i]);
                }
            }

        } catch (Exception e) {
            e.printStackTrace();
        } finally{
            try {
                in.close();
                fos.close();
            } catch (Exception e) {
                e.printStackTrace();
            }
        }
    }

结果
image.png