首页 技术 正文
技术 2022年11月15日
0 收藏 880 点赞 4,233 浏览 2677 个字

爬取煎蛋网

1、找出页面网址的规律

2、设计页面图片网址的正则

代码:

import java.io.BufferedInputStream;
import java.io.BufferedOutputStream;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.InputStreamReader;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.regex.Matcher;
import java.util.regex.Pattern;public class SpiderTest { private static ArrayList<String> urlStrs = new ArrayList<String>();
private static String regx = "\"[\\S]*\\.(jpg|gif)"; //读取jpg和gif图片的正则
private static int num = 0; //图片名递增量 public static void main(String[] args)throws Exception{
//String urlStr = "http://jandan.net/ooxx/page-2381#comments"; //要抓取的煎蛋妹子网页示例
String urlStr="";
String dstDir = "d:/dstDir";
int start = 2340; //起始页
int end = 2370; //结束页 for(int i=start;i<=end;i++){
urlStr = "http://jandan.net/ooxx/page-"+i+"#comments";
matchAll(urlStr);
if(urlStrs.size() > 0){
for(String imgStr:urlStrs){
downFile(imgStr,dstDir);
Thread.sleep(300); //休息一会
}
}
urlStrs.clear();
}
System.out.println("网址抓取完毕");
}
/*
* @param:urlStr 要爬取的网址
*/
private static void matchAll(String urlStr)throws Exception{
Pattern p = Pattern.compile(regx);
Matcher m;
URL url;
try {
url = new URL(urlStr);
} catch (MalformedURLException e) {
throw new Exception("网址不存在");
} BufferedReader read= new BufferedReader(new InputStreamReader(url.openStream()));
String line = "";
while((line = read.readLine()) != null){
m = p.matcher(line);
while(m.find()){
System.out.println(m.group());
urlStrs.add("http:"+m.group().substring(1)); //将图片网址添加到ArrayList(过滤第一个双引号)
}
}
read.close();
}
/*下载指定图片网址的图片
* @param:urlStr 图片网址
* @param:dstDir 图片存放目录
*/
private static void downFile(String urlStr,String dstDir)throws Exception{
byte[] bBuf = new byte[1024];
File dir = new File(dstDir);
String fileName = "";
if(!dir.exists()){
dir.mkdir();
}
if(urlStr.endsWith("jpg")){
fileName = (num++) + ".jpg";
}else if(urlStr.endsWith("gif")){
fileName = (num++) + ".gif";
}
File imgFile = new File(dstDir,fileName);
//if(imgFile.exists()){
// TODO..
//}
URL url = new URL(urlStr);
BufferedInputStream in = new BufferedInputStream(url.openStream());
BufferedOutputStream out = new BufferedOutputStream(new FileOutputStream(imgFile)); System.out.println("开始下载。。");
int len = 0;
while((len = in.read(bBuf)) != -1){
out.write(bBuf,0,len);
}
System.out.println("下载完毕");
in.close();
out.close();
}
/*
* 获取网页源码(此方法没有使用)
*/
private void getSourceCode(String u)throws Exception{
//String u = "http://m.onepiece.cc/post/10001/";
File f = new File("d:/tmp.txt");
if(!f.exists()){
f.createNewFile();
}
URL url = new URL(u);
BufferedReader read = new BufferedReader(new InputStreamReader(url.openStream()));
BufferedWriter write = new BufferedWriter(new FileWriter(f));
String s = "";
while((s=read.readLine()) != null){
write.write(s);
write.write('\n');
}
System.out.println("拷贝完成");
read.close();
write.close();
}
}

java小爬虫

相关推荐
python开发_常用的python模块及安装方法
adodb:我们领导推荐的数据库连接组件bsddb3:BerkeleyDB的连接组件Cheetah-1.0:我比较喜欢这个版本的cheeta…
日期:2022-11-24 点赞:878 阅读:8,996
Educational Codeforces Round 11 C. Hard Process 二分
C. Hard Process题目连接:http://www.codeforces.com/contest/660/problem/CDes…
日期:2022-11-24 点赞:807 阅读:5,510
下载Ubuntn 17.04 内核源代码
zengkefu@server1:/usr/src$ uname -aLinux server1 4.10.0-19-generic #21…
日期:2022-11-24 点赞:569 阅读:6,353
可用Active Desktop Calendar V7.86 注册码序列号
可用Active Desktop Calendar V7.86 注册码序列号Name: www.greendown.cn Code: &nb…
日期:2022-11-24 点赞:733 阅读:6,137
Android调用系统相机、自定义相机、处理大图片
Android调用系统相机和自定义相机实例本博文主要是介绍了android上使用相机进行拍照并显示的两种方式,并且由于涉及到要把拍到的照片显…
日期:2022-11-24 点赞:512 阅读:7,770
Struts的使用
一、Struts2的获取  Struts的官方网站为:http://struts.apache.org/  下载完Struts2的jar包,…
日期:2022-11-24 点赞:671 阅读:4,848