jsoup提取链接下载网站图片

jsoup提取链接下载网站图片

所使用的包分别为 commons-httpclient.jar 和 jsoup-1.6.1.jar;
利用 jsoup 提取页面中图片的 src 路径;
通过 HTTP 连接下载网站图片(下面的示例代码实际使用的是 JDK 自带的 java.net.HttpURLConnection)。

1. [代码] jsoup提取src路径,下载网站图片


public class DownImages {

02

    private static int COUNT = 0;

 

03

    private static int DOWN_COUNT = 0;

04

     

 

05

    public static void jsoupHTML(String urlPath) throws Exception{

06

        Document doc = Jsoup.connect(urlPath).timeout(1000000).get();

 

07

        //:当前页中的图片

08

        Elements srcLinks = doc.select("img[src$=.jpg]");

 

09

        for (Element link : srcLinks) {

10

            //:剔除标签,只剩链接路径

 

11

            String imagesPath = link.attr("src");

12

            System.out.println("当前访问路径:"+imagesPath);

 

13

            getImages(imagesPath, "d://images//0000"+ ++COUNT +".jpg");

14

        }

 

15

         

16

        //:提取网站中所有的href连接

 

17

        Elements linehrefs = doc.select("a[href]");

18

         

 

19

        for (Element linehref : linehrefs) {

20

            String lihr = linehref.attr("href");

 

21

            if(lihr.length()>4){

22

                String ht = lihr.substring(0, 4);

 

23

                String htt = lihr.substring(0, 1);

24

                if(!ht.equals("http") && htt.equals("/")){

 

25

                    lihr = urlPath + lihr;

26

                }

 

27

                if(lihr.substring(0, 4).equals("http")){

28

                    Document docs = Jsoup.connect(lihr).timeout(1000000).get();

 

29

                    Elements links = docs.select("img[src$=.jpg]");

30

                    for (Element link : links) {

 

31

                        //:剔除标签,只剩链接路径

32

                        String imagesPath = link.attr("src");

 

33

                        System.out.println("当前访问路径:"+imagesPath);

34

                        getImages(imagesPath, "d://images//0000"+ COUNT++ +".jpg");

 

35

                    }

36

                }

 

37

            }

38

        }

 

39

    }

40

     

 

41

     

42

    /**

 

43

     * @param urlPath 图片路径

44

     * @throws Exception

 

45

     */

46

    public static void getImages(String urlPath,String fileName) throws Exception{

 

47

        URL url = new URL(urlPath);//:获取的路径

48

        //:http协议连接对象

 

49

        HttpURLConnection conn = (HttpURLConnection) url.openConnection();

50

        conn.setRequestMethod("GET");

 

51

        conn.setReadTimeout(6 * 10000);

52

        if (conn.getResponseCode() <10000){

 

53

            InputStream inputStream = conn.getInputStream();

54

            byte[] data = readStream(inputStream);

 

55

            if(data.length>(1024*10)){

56

                FileOutputStream outputStream = new FileOutputStream(fileName);

 

57

                outputStream.write(data);

58

                System.err.println("第"+ ++DOWN_COUNT +"图片下载成功");

 

59

                outputStream.close();

60

            }

 

61

        }

62

         

 

63

    }

64

     

 

65

    /**

66

     * 读取url中数据,并以字节的形式返回

 

67

     * @param inputStream

68

     * @return

 

69

     * @throws Exception

70

     */

 

71

    public static byte[] readStream(InputStream inputStream) throws Exception{

72

        ByteArrayOutputStream outputStream = new ByteArrayOutputStream();

 

73

        byte[] buffer = new byte[1024];

74

        int len = -1;

 

75

        while((len = inputStream.read(buffer)) !=-1){

76

            outputStream.write(buffer, 0, len);

 

77

        }

78

        outputStream.close();

 

79

        inputStream.close();

80

        return outputStream.toByteArray();

 

81

    }

82

     

 

83

    public static void main(String[] args) {

84

        try {

 

85

            String urlPath = "http://www.22mm.cc/";

86

            jsoupHTML(urlPath);

 

87

        } catch (Exception e) {

88

            e.printStackTrace();

 

89

        }finally{

90

            System.out.println("共访问"+COUNT+"张图片,其中下载"+DOWN_COUNT+"张图片");

 

91

        }

92

    }

 

93

}

 

;