JAVA编写网络爬虫笔记（第二部分:httpClient下载页面）

接着上一部分的内容，我们看一下怎样下载和解析页面。这里用到的java包主要是httpclient，可以去apache官网下载我们是利用httpclient生成一个GetMethod的对象，这个对象可以请求网页，然后网页回应html源代码给我们，我们就可以保存为一个html文件或者txt文件，然后就进行下一步的信息提取了。主要的代码实现import java.io.DataOutputStream

sunyuan_software

1055人浏览 · 2015-05-29 08:16:26

sunyuan_software · 2015-05-29 08:16:26 发布

接着上一部分的内容，我们看一下怎样下载和解析页面。这里用到的java包主要是httpclient，可以去apache官网下载

我们是利用httpclient生成一个GetMethod的对象，这个对象可以请求网页，然后网页回应html源代码给我们，我们就可以保存为一个html文件或者txt文件，然后就进行下一步的信息提取了。

主要的代码实现

import java.io.DataOutputStream;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.PrintStream;
import org.apache.commons.httpclient.DefaultHttpMethodRetryHandler;
import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.HttpException;
import org.apache.commons.httpclient.HttpMethod;
import org.apache.commons.httpclient.HttpStatus;
import org.apache.commons.httpclient.methods.GetMethod;
import org.apache.commons.httpclient.methods.PostMethod;
import org.apache.commons.httpclient.params.HttpMethodParams;



    public class DownLoadFile {

        /**
         * 根据URL和网页类型生成需要保存的网页的文件名，去除URL中的非文件名字符
         * */
    public String getFileNameByUrl(String url,String contentType){
            //移除http:
            url=url.substring(7);
            //text/html类型
            if(contentType.indexOf("html")!=-1){
                url=url.replaceAll("[\\?/:*|<>\"]","_")+".html";
                return url;
            }
            else
            {
                return url.replaceAll("[\\?:*|<>\"]","_")+"."+contentType.substring(contentType.lastIndexOf("/")+1);
            }
        }
        /**
         * 保存网页字节数组到本地文件,filepath为要保存文件的相对地址
         * */
        private void saveToLocal(String context ,String filePath){
            try {   

                File file = new File(filePath);
                if(!file.exists()){
                    file.createNewFile();}
             PrintStream ps = new PrintStream(new FileOutputStream(file));
             ps.append(context);}
             catch (FileNotFoundException e) {
                // TODO 自动生成的 catch 块
                e.printStackTrace();
            } catch (IOException e) {
                // TODO 自动生成的 catch 块
                e.printStackTrace();
            }

        }


        /**
         * 下载url所指向的网页
         * */
        public String downloadFile(String url){
            String filePath =null;
            String filePath1=null;
            //生成HttpClient对象并设置参数
            HttpClient httpClient=new HttpClient();
            //2.生成GetMethod对象并设置参数
            GetMethod getMethod =new GetMethod(url);
              httpClient.getParams().setParameter(HttpMethodParams.HTTP_CONTENT_CHARSET,"gb2312");

            //3.执行http get请求
    try {
                int statusCode=httpClient.executeMethod(getMethod);
                //判断访问的状态码
                if(statusCode!=HttpStatus.SC_OK){
                    System.out.println("Method failed:"+getMethod.getStatusLine());
                    filePath=null;
                }
                //处理HTTP内容
            String  responseBody=getMethod.getResponseBodyAsString();
                //根据网页url生成保存时的文件名
                filePath="C:"+File.separator+"test1"+File.separator+"+getFileNameByUrl(url,getMethod.getResponseHeader('Content-Type').getValue())";

                saveToLocal(responseBody,filePath);

            } catch (HttpException e) {
                // 发生异常，可能是协议不对或者返回的内容有问题
                System.out.println("Please check your provided http address!");
                e.printStackTrace();
            } catch (IOException e) {   
                // 发生网络异常
                e.printStackTrace();
            }finally{
                //释放链接
                getMethod.releaseConnection();
            }
            return filePath;

        }

    }