`
kevinflynn
  • 浏览: 39013 次
  • 性别: Icon_minigender_1
  • 来自: 南京
社区版块
存档分类
最新评论

模拟登录百度的Java实现

阅读更多

常常需要爬取百度统计出来的数据,难免要进行百度的模拟登录!现将程序贴出来,供他人参考,也供自己以后使用:

 

 

package org.baidu;


import java.util.List;

import org.apache.http.HttpEntity;
import org.apache.http.HttpResponse;
import org.apache.http.NameValuePair;
import org.apache.http.client.CookieStore;
import org.apache.http.client.HttpClient;
import org.apache.http.client.entity.UrlEncodedFormEntity;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.client.methods.HttpUriRequest;
import org.apache.http.client.protocol.ClientContext;
import org.apache.http.impl.client.BasicCookieStore;
import org.apache.http.impl.client.DefaultHttpClient;
import org.apache.http.impl.conn.tsccm.ThreadSafeClientConnManager;
import org.apache.http.protocol.BasicHttpContext;
import org.apache.http.protocol.HttpContext;

/**
 * Small wrapper around Apache HttpClient that issues GET and POST requests
 * and passes the same {@link CookieStore} to every request, so cookies set
 * by one response are available to later requests.
 *
 * <p>A single instance is shared through {@link #getInstance()}.
 *
 * @author kevin
 */
@SuppressWarnings("deprecation")
public class BaiduConnectService {

	/**
	 * Charset used to URL-encode POST form fields. The single-argument
	 * {@code UrlEncodedFormEntity} constructor falls back to ISO-8859-1,
	 * which cannot represent non-ASCII values such as Chinese user names.
	 */
	private static final String FORM_CHARSET = "UTF-8";

	/** Cookie store handed to every request through the request context. */
	private CookieStore cookieStore = new BasicCookieStore();

	/** Private so that the only instance is the one held by the nested holder class. */
	private BaiduConnectService(){
		// Printed here rather than in getInstance() so the message appears
		// once, when the instance is actually created.
		System.out.println("初始化:BaiduConnectService.");
	}

	/** Holds the single instance; it is created when this nested class is first initialized. */
	private static class BaiduConnectServiceContainer{
		private static final BaiduConnectService bc = new BaiduConnectService();
	}

	/**
	 * Returns the shared service instance.
	 *
	 * @return the single {@code BaiduConnectService}
	 */
	public static BaiduConnectService getInstance(){
		return BaiduConnectServiceContainer.bc;
	}

	/**
	 * Issues a GET request for {@code url}.
	 *
	 * @param url the address to request
	 * @return the response; the caller is responsible for consuming its entity
	 * @throws Exception if the request cannot be executed
	 */
	public HttpResponse execute(String url) throws Exception{
		return this.execute(url, null);
	}

	/**
	 * Issues a request for {@code url}: a POST carrying {@code params} as a
	 * URL-encoded form when {@code params} is non-null, otherwise a GET.
	 *
	 * @param url    the address to request
	 * @param params form fields to post, or {@code null} to send a GET
	 * @return the response; the caller is responsible for consuming its entity
	 * @throws Exception if the form body cannot be encoded or the request
	 *                   cannot be executed
	 */
	public HttpResponse execute(String url, List<NameValuePair> params) throws Exception{
		HttpUriRequest request;
		if (params != null) {
			HttpPost httpPost = new HttpPost(url);
			// An encoding failure now propagates to the caller. Previously it
			// was printed and swallowed, and the POST was sent with no body.
			HttpEntity formBody = new UrlEncodedFormEntity(params, FORM_CHARSET);
			httpPost.setEntity(formBody);
			request = httpPost;
		} else {
			request = new HttpGet(url);
		}

		// NOTE(review): a new client and connection manager are created for
		// every call and are never shut down in this class. Sharing one client
		// would be cheaper, but a pooled client can block callers that do not
		// consume response entities, so that change needs a check of all callers.
		HttpClient httpClient = new DefaultHttpClient(
				new ThreadSafeClientConnManager());

		// The cookie store is attached per request through the context.
		HttpContext localContext = new BasicHttpContext();
		localContext.setAttribute(ClientContext.COOKIE_STORE, cookieStore);

		HttpResponse response = httpClient.execute(request, localContext);
		System.out.println("[HTTP状态码:" + response.getStatusLine().getStatusCode() + "]" + "-->Request URL:" + url);
		return response;
	}

	/**
	 * @return the cookie store currently attached to outgoing requests
	 */
	public CookieStore getCookieStore() {
		return cookieStore;
	}

	/**
	 * Replaces the cookie store attached to subsequent requests.
	 *
	 * @param cookieStore the store to use from now on
	 */
	public void setCookieStore(CookieStore cookieStore) {
		this.cookieStore = cookieStore;
	}
}

 

package org.baidu;


import java.io.UnsupportedEncodingException;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.http.HttpResponse;
import org.apache.http.NameValuePair;
import org.apache.http.message.BasicNameValuePair;
import org.apache.http.util.EntityUtils;
import org.htmlparser.Parser;
import org.htmlparser.filters.AndFilter;
import org.htmlparser.filters.HasAttributeFilter;
import org.htmlparser.filters.TagNameFilter;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;

/**
 * Performs a scripted Baidu login through {@link BaiduConnectService}:
 * requests the home page, fetches a login token, posts the login form and
 * then re-requests the home page to decide whether the login succeeded.
 *
 * @author kevin
 */
public class BaiduLoginService {

	private static final String BAIDU_URL = "http://www.baidu.com";
	private static final String TOKEN_GET_URL = "https://passport.baidu.com/v2/api/?getapi&tpl=mn&apiver=v3&class=login&logintype=dialogLogin";
	private static final String LOGIN_POST_URL = "https://passport.baidu.com/v2/api/?login";
	private static final String QUERY_GET_URL="http://index.baidu.com/?tpl=trend&word=%D5%F7%B2%F0";

	/** Captures the token value from the body returned by {@link #TOKEN_GET_URL}; compiled once. */
	private static final Pattern TOKEN_PATTERN = Pattern.compile("token\" : \"(.*?)\"");

	private final BaiduConnectService bc = BaiduConnectService.getInstance();

	private final String username;
	private final String password;
	private final String verifycode;
	private final String codestring;
	/** Login token; set by {@link #initToken()} before the login form is posted. */
	private String token;

	/**
	 * @param username   account name posted as the {@code username} form field
	 * @param password   password posted as the {@code password} form field
	 * @param verifycode value posted as the {@code verifycode} form field
	 * @param codestring value posted as the {@code codestring} form field
	 */
	public BaiduLoginService(String username,String password,String verifycode,String codestring){
		this.username = username;
		this.password = password;
		this.verifycode = verifycode;
		this.codestring = codestring;
	}

	/**
	 * Manual test entry point.
	 *
	 * @param args unused
	 * @throws Exception if any request fails
	 */
	public static void main(String[] args) throws Exception{
		// Make sure this account can log in successfully at www.baidu.com first.
		new BaiduLoginService("账号", "密码","","").login();
	}

	/**
	 * Runs the login sequence and prints the outcome to standard output.
	 *
	 * @throws Exception if a request fails or no token can be extracted
	 */
	public void login() throws Exception{
		System.out.println("准备登录 . Usename:"+username);
		// Pre-login: obtain the cookies needed to fetch the token.
		HttpResponse preLogin = bc.execute(BAIDU_URL);
		// The body is not needed; consume it so the connection is released.
		EntityUtils.consume(preLogin.getEntity());

		this.initToken();
		System.out.println("正在登录。");
		String result = readBody(bc.execute(LOGIN_POST_URL, produceFormEntity()));
		String statusCode = substring(result, "error=", "'");
		System.out.println("百度返回的状态码:" + statusCode);
		System.out.println("--------------------------------");
		if(!checkLogin()){
			System.out.println("登录异常或频繁,需要验证码,codeString为:" + substring(result, "codestring=", "&"));
			System.out.println("登录结果:" + username + " 登录失败.");
		}else{
			System.out.println("登录结果:" + " 登录成功.");
		}
	}

	/**
	 * Requests {@link #QUERY_GET_URL}, prints the body and reports whether it
	 * contains the "未被收录" marker. Not invoked by {@link #login()}.
	 *
	 * @throws Exception if the request fails
	 */
	private void queryKeywordsUrl() throws Exception {
		System.out.println("获取关键词的百度指数...");
		String str = readBody(bc.execute(QUERY_GET_URL));
		System.out.println(str);

		// Per the original author's note, the page shows a message beginning
		// with "未被收录" (not indexed) when the keyword has no index data.
		if(str.contains("未被收录")){
			System.out.println("关键词未被收录");
		}else{
			System.out.println("关键词已被收录");
		}
	}

	/**
	 * Parses {@code content} as HTML and returns the {@code tag} elements
	 * whose {@code name} attribute equals {@code name}.
	 *
	 * @param content HTML text to parse
	 * @param tag     tag name to match
	 * @param name    required value of the {@code name} attribute
	 * @return the matching nodes, or {@code null} if parsing fails
	 */
	public NodeList getNodeByName(String content,String tag,String name){
		Parser parser = Parser.createParser(content, "utf-8");
		AndFilter filter = new AndFilter(new TagNameFilter(tag),new HasAttributeFilter("name",name));
		try {
			return parser.parse(filter);
		} catch (ParserException e) {
			e.printStackTrace();
			return null;
		}
	}

	/**
	 * Builds the form fields posted to {@link #LOGIN_POST_URL}.
	 *
	 * @return the login form parameters
	 */
	private List<NameValuePair> produceFormEntity() throws UnsupportedEncodingException{
		List<NameValuePair> list = new ArrayList<NameValuePair>();
		list.add(new BasicNameValuePair("tt", String.valueOf(System.currentTimeMillis())));
		list.add(new BasicNameValuePair("tpl", "mn"));
		list.add(new BasicNameValuePair("token", token));
		list.add(new BasicNameValuePair("isPhone", ""));
		list.add(new BasicNameValuePair("username", username));
		list.add(new BasicNameValuePair("password", password));
		list.add(new BasicNameValuePair("verifycode", verifycode));
		list.add(new BasicNameValuePair("codestring", codestring));
		return list;
	}

	/**
	 * Fetches {@link #TOKEN_GET_URL} and stores the token found in its body.
	 *
	 * @throws IllegalStateException if the body contains no token; previously
	 *                               the login form was posted with a null or
	 *                               stale token in that case
	 * @throws Exception             if the request fails
	 */
	private void initToken() throws Exception{
		System.out.println("获取百度Token...");
		String body = readBody(bc.execute(TOKEN_GET_URL));
		Matcher matcher = TOKEN_PATTERN.matcher(body);
		if(!matcher.find()){
			throw new IllegalStateException("No token found in the response from " + TOKEN_GET_URL);
		}
		token = matcher.group(1);
		System.out.println("Token已获取:"+token);
	}

	/**
	 * Re-requests the home page and treats the absence of the text "登录"
	 * in the body as a successful login.
	 *
	 * @return {@code true} if the body does not contain "登录"
	 * @throws Exception if the request fails
	 */
	private boolean checkLogin() throws Exception{
		String content = readBody(bc.execute(BAIDU_URL));
		return !content.contains("登录");
	}

	/**
	 * Reads the response body as a string and consumes the entity afterwards,
	 * including when reading fails.
	 */
	private static String readBody(HttpResponse response) throws Exception{
		try {
			return EntityUtils.toString(response.getEntity());
		} finally {
			EntityUtils.consume(response.getEntity());
		}
	}

	/**
	 * Returns the text between the first occurrence of {@code s1} and the
	 * next occurrence of {@code s2} after it.
	 *
	 * <p>If either marker is missing, {@code str} is returned unchanged.
	 * Previously a missing {@code s1} made the search start at offset
	 * {@code s1.length() - 1} and could return unrelated text.
	 *
	 * @param str text to search
	 * @param s1  marker that precedes the wanted text
	 * @param s2  marker that follows the wanted text
	 * @return the text between the markers, or {@code str} if a marker is absent
	 */
	public static String substring(String str, String s1, String s2) {
		int s1Index = str.indexOf(s1);
		if (s1Index < 0) {
			return str;
		}
		int start = s1Index + s1.length();
		int end = str.indexOf(s2, start);
		if (end < 0) {
			return str;
		}
		return str.substring(start, end);
	}
}

   运行BaiduLoginService即可实现登录!

 

   项目所需的jar包如下:

   项目所需jar包

 

 

  • 大小: 9.9 KB
0
4
分享到:
评论
4 楼 kevinflynn 2015-06-18  
qindongliang1922 写道
验证码能自动破解么

没有写有验证码的功能!Sorry!
3 楼 qindongliang1922 2015-06-17  
验证码能自动破解么
2 楼 kevinflynn 2015-06-17  
是这样的!还是得不断地学习,不断地总结经验才可以!
1 楼 hellohank 2015-06-17  
现在的很多网页,包括百度,页面上大量使用了js动态加载或js解析等方式展示内容,使用HttpClient方式已经不能很好地抓取这些信息了~
前两天我抓取一些百度地图上的一些信息时,一开始也是使用HttpClient,结果……被迫写了一个基于WebDriver的。HttpClient速度快,但不支持js动态内容;WebDriver完全调用浏览器,但效率低!

相关推荐

Global site tag (gtag.js) - Google Analytics