1
waytoexplorer 2016-11-02 09:44:28 +08:00
That sessionid may change periodically; you'll need to refresh it accordingly when you crawl.
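A minimal sketch of that idea: re-acquire the session id immediately before each crawl run instead of caching it. `loginAndGetSessionId()` and `newProcessorWithCookie()` are hypothetical helper names, not real APIs:

```java
// Hypothetical sketch: re-login right before every crawl run so the spider
// never starts with a stale PHPSESSID. loginAndGetSessionId() and
// newProcessorWithCookie() are assumed helpers standing in for the login code.
String sessionId = loginAndGetSessionId(username, password);
PageProcessor processor = newProcessorWithCookie(sessionId);
Spider.create(processor)
      .addUrl("http://www.digifilm.com.cn/index.php/member/index")
      .run();
```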
|
2
CharlesL 2016-11-02 10:23:37 +08:00
Try attaching the cookie from the simulated login to webmagic as well.
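A minimal sketch of what that looks like with WebMagic's `Site` API, assuming `sessionId` holds the PHPSESSID value the login code obtained:

```java
import us.codecraft.webmagic.Site;

class SiteConfig {
    // Sketch: hand the session cookie from the simulated login to WebMagic.
    // sessionId is assumed to hold the PHPSESSID value from the login response.
    static Site withLoginCookie(String sessionId) {
        return Site.me()
                // setDomain must come before addCookie, or the cookie is ignored
                .setDomain("www.digifilm.com.cn")
                .addCookie("PHPSESSID", sessionId)
                // mirror the browser's User-Agent so both requests look alike
                .setUserAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64)");
    }
}
```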
|
3
Jobin0528 OP @waytoexplorer How would I do that?
|
5
joechan 2016-11-02 12:20:42 +08:00
Most likely the sessionid you're attaching still differs on every request.
|
6
winglight2016 2016-11-02 15:31:50 +08:00
Visit the site in a browser and inspect every header and the response of the login request; you've almost certainly missed simulating something.
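On the HttpClient side, a small sketch that dumps the status line and every header of a response, so it can be diffed field by field against what the browser's dev tools show for the same request:

```java
import org.apache.http.Header;
import org.apache.http.HttpResponse;

class HeaderDump {
    // Sketch: print the status line and all response headers so they can be
    // compared one by one with the browser's network tab.
    static void dump(HttpResponse response) {
        System.out.println(response.getStatusLine());
        for (Header h : response.getAllHeaders()) {
            System.out.println(h.getName() + ": " + h.getValue());
        }
    }
}
```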
|
7
Jobin0528 OP
```java
@RequestMapping(value = "doGrab", method = RequestMethod.POST)
public String doGrab(String username, String password, HttpServletRequest request) {
    try {
        String cookie = simulationHttpUtil.getCookie(username, password);
        String[] cookies = cookie.split("=");
        webMagicUtil.setSite(cookies[1]);
        webMagicUtil.setCook(cookie);
        Spider.create(webMagicUtil)
                // start crawling from this page
                .addUrl("http://www.digifilm.com.cn/index.php/member/index")
                .addPipeline(new ConsolePipeline())
                // crawl with 5 threads
                .thread(5)
                // start the spider
                .run();
        System.out.print(webMagicUtil.getLength());
    } catch (Exception e) {
        e.printStackTrace();
    }
    return null; // the original paste omitted the return statement
}

@Component
public class SimulationHttpUtil {

    // supplies the crawler with the login cookie
    public String getCookie(String username, String password) throws Exception {
        // standard cookie policy /*.STANDARD_STRICT*/
        RequestConfig requestConfig = RequestConfig.custom()
                .setCookieSpec(CookieSpecs.BEST_MATCH).build();
        CloseableHttpClient httpClient = HttpClients.custom()
                .setDefaultRequestConfig(requestConfig).build();

        // basic headers for the request that fetches the login page
        HttpGet getHomePage = new HttpGet("http://www.digifilm.com.cn/index.php/public/login");
        getHomePage.setHeader("Accept", "text/html,application/xhtml+xml,image/jxr,*/*");
        getHomePage.setHeader("Accept-Encoding", "gzip,deflate");
        getHomePage.setHeader("Accept-Language", "zh-CN");
        getHomePage.setHeader("Connection", "Keep-Alive");
        getHomePage.setHeader("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.79 Safari/537.36 Edge/14.14393");

        CloseableHttpResponse response = httpClient.execute(getHomePage);
        String rec = setCookie(response); // records the pre-login Set-Cookie header (helper not shown)
        //printResponse(response);
        // login page source; it contains <input type="hidden" name="__hash__" value=""/>
        String responseHtml = EntityUtils.toString(response.getEntity());
        String hashValue = responseHtml
                .split("<input type=\"hidden\" name=\"__hash__\" value=\"")[1]
                .split("\" />")[0];
        response.close();

        List<NameValuePair> valuePairs = new LinkedList<NameValuePair>();
        valuePairs.add(new BasicNameValuePair("__hash__", hashValue));
        valuePairs.add(new BasicNameValuePair("password", password));
        valuePairs.add(new BasicNameValuePair("username", username));

        while (true) {
            // fetch the captcha
            HttpGet getCaptcha = new HttpGet("http://www.digifilm.com.cn/index.php/Verify/verify/?rand=" + Math.random());
            CloseableHttpResponse imageResponse = httpClient.execute(getCaptcha);
            // convert the returned PNG image to JPG
            InputStream in = imageResponse.getEntity().getContent();
            BufferedImage bufferedImage = imageUtil.imageChange(in);
            imageResponse.close();
            in.close();
            // denoise the image
            File file = imageUtil.cleanImage(bufferedImage);
            // OCR the cleaned image
            String text = scanCodeUtil.recognizeText(file);
            System.out.println("Captcha OCR result: " + text);

            // finish building the login request; copy the base parameters so a
            // failed attempt does not leave a stale "verify" value in the list
            List<NameValuePair> form = new LinkedList<NameValuePair>(valuePairs);
            form.add(new BasicNameValuePair("verify", text));
            UrlEncodedFormEntity entity = new UrlEncodedFormEntity(form, Consts.UTF_8);
            HttpPost post = new HttpPost("http://www.digifilm.com.cn/index.php/public/checklogin");
            post.setEntity(entity);
            CloseableHttpResponse httpResponse = httpClient.execute(post); // log in
            httpResponse.close();

            // a GET request to test whether the login cookie was obtained;
            // without a cookie the request carries none and the check fails
            HttpGet g = new HttpGet("http://www.digifilm.com.cn/index.php/member/index");
            CloseableHttpResponse r = httpClient.execute(g);
            Header header = r.getFirstHeader("Content-Length");
            int contentLength = Integer.parseInt(header.getValue());
            if (contentLength > 7000) { // a logged-in member page is larger than the login page
                r.close();
                break;
            }
            r.close();
        }
        //httpClient.close();
        String rec2 = rec.split(";")[2];
        return rec2;
    }
}

@Component
public class WebMagicUtil implements PageProcessor {

    private int length;
    private String cook; // full cookie string; setter is referenced from the controller

    // Part 1: site configuration: encoding, crawl interval, retries, timeout, etc.
    private Site site;

    public void setSite(String cookie) {
        this.site = Site.me().setRetryTimes(3).setSleepTime(1000)
                .setTimeOut(1000 * 60 * 60).setCycleRetryTimes(3)
                // the domain must be set before adding cookies, otherwise they take no effect
                .setDomain("www.digifilm.com.cn")
                // attach the cookie obtained from the simulated login
                .addCookie("PHPSESSID", cookie)
                // request headers; the site uses them to tell browsers from crawlers
                .addHeader("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.79 Safari/537.36 Edge/14.14393")
                .addHeader("Accept", "text/html, application/xhtml+xml, image/jxr, */*")
                .addHeader("Accept-Encoding", "gzip,deflate")
                .addHeader("Accept-Language", "zh-CN")
                .addHeader("Connection", "Keep-Alive");
                //.addHeader("Referer", "http://www.digifilm.com.cn/index.php/public/login");
    }

    public void setCook(String cook) {
        this.cook = cook;
    }

    public int getLength() {
        return length;
    }

    @Override
    public Site getSite() { // required by PageProcessor; missing from the original paste
        return site;
    }

    // process is the core extraction hook; the scraping logic goes here
    @Override
    public void process(Page page) {
        // on the start page, collect links to the key-list and single-download pages
        if (page.getUrl().regex("(.*/index\\.php/member/index)").match()) {
            page.addTargetRequests(page.getHtml().xpath("//div[@class='leaguer']").links()
                    .regex("(.*/index\\.php/(\\w+)_down/index)").all());
        }
        // on the key-list page, collect single-download links and pagination links
        if (page.getUrl().regex("(.*/index\\.php/(\\w+)_down/index)").match()) {
            page.addTargetRequests(page.getHtml().xpath("//div[@class='SMAMiddle SMAMiddlelb']").links()
                    .regex("(.*/index\\.php/(\\w+)_down/content/id/.*)").all());
            // pagination links
            page.addTargetRequests(page.getHtml().xpath("//div[@class='fanye_1']").links()
                    .regex("(.*/index\\.php/(\\w+)_down/index\\?&p=\\d+)").all());
        }
        // extract the fields on a single key-download page
        if (page.getUrl().regex("(.*/index\\.php/(\\w+)_down/content/id/\\w+)").match()) {
            page.putField("filmTitle", page.getHtml().xpath("//div[@class='videoDescri']/span[1]/text()"));
            page.putField("filmSchedule", page.getHtml().xpath("//div[@class='videoDescri']/span[2]/text()"));
            page.putField("filmType", page.getHtml().xpath("//div[@class='videoDescri']/span[3]/text()"));
            page.putField("secretKey", page.getHtml()
                    .xpath("//div[@class='SMAMiddle SMAMiddlela']/ul/li/a[@class='load']").links()
                    .regex("(.*/download\\.php\\?mid=.*)").all());
            List<String> list = page.getResultItems().get("secretKey");
            for (String url : list) {
                try {
                    System.out.println(url);
                    downloadFromUrl(url, "C:\\360Downloads\\Test\\");
                } catch (Exception e) {
                    e.printStackTrace();
                }
            }
            length++;
        }
    }

    // test download code
    public static String downloadFromUrl(String url, String dir) {
        try {
            URL httpurl = new URL(url);
            String fileName = getFileNameFromUrl(url);
            System.out.println(fileName);
            File saveDir = new File(dir);
            if (!saveDir.exists()) {
                saveDir.mkdir();
            }
            File file = new File(saveDir + File.separator + fileName);
            file.createNewFile();
            FileUtils.copyURLToFile(httpurl, file);
        } catch (Exception e) {
            e.printStackTrace();
            return "Fault";
        }
        return "Successful!";
    }

    public static String getFileNameFromUrl(String url) {
        return Long.toString(System.currentTimeMillis()) + ".xml";
    }
}
```
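One fragile spot in the paste above is `rec.split(";")[2]`: the session id is recorded from the pre-login GET and extracted by position from the Set-Cookie header, so if the server rotates PHPSESSID after `checklogin`, the returned value is already stale. A minimal sketch of a more robust variant using HttpClient's cookie store; the cookie name follows the paste, everything else is an assumption:

```java
import org.apache.http.client.CookieStore;
import org.apache.http.cookie.Cookie;
import org.apache.http.impl.client.BasicCookieStore;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;

class SessionCookie {
    // Sketch: let HttpClient track cookies in a shared store, then read
    // PHPSESSID *after* the login POST, so a rotated session id is picked up.
    static CloseableHttpClient clientWith(CookieStore store) {
        // every request made through this client updates the same store
        return HttpClients.custom().setDefaultCookieStore(store).build();
    }

    static String currentSessionId(CookieStore store) {
        for (Cookie c : store.getCookies()) {
            if ("PHPSESSID".equals(c.getName())) {
                return c.getValue();
            }
        }
        return null; // not logged in yet
    }
}
```

Calling `currentSessionId(store)` only after the login POST has returned should yield the value the member pages actually expect, with no string splitting.
|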
9
zoran 2016-11-02 18:36:14 +08:00
For a Java crawler you can try https://github.com/zhuoran/crawler4j. Remember to star it~~ :)
|
10
Yc1992 2016-11-02 18:37:04 +08:00
Could it be that the site enforces single sign-on?
|