java实现登录之后抓取数据

网友投稿 220 2023-07-24

java实现登录之后抓取数据

最近做了一个从网络上抓取数据的小程序,主要是信贷方面的:收集了一些黑名单网站,把这些网站上的数据抓取到自己的系统中。

也找了一些资料,觉得没有一个很好的,全面的例子。因此在这里做个笔记提醒自己。

首先需要一个jsoup的jar包,我用的是1.6.0。下载地址为:http://pan.baidu.com/s/1mgqOuHa

1,获取网页内容(核心代码,技术有限没封装)。

2,登录之后抓取网页数据(如何在请求中携带cookie)。

3,获取网站的ajax请求方法(返回json)。

以上这三点我就用一个类全部包含(比较粗糙,望见谅;直接copy代码过去,应该就可以用)。

一,这个类分别包含上面1,2,3三种方法,直接运行main方法就可以进行测试。

package com.minxinloan.black.web.utils;

import java.io.BufferedReader;

import java.io.ByteArrayOutputStream;

import java.io.DataInputStream;

import java.io.DataOutputStream;

import java.io.File;

import java.io.FileOutputStream;

import java.io.FileWriter;

import java.io.IOException;

import java.io.InputStream;

import java.io.InputStreamReader;

import java.io.OutputStream;

import java.io.PrintWriter;

import java.net.HttpURLConnection;

import java.net.URL;

import java.net.URLConnection;

import java.net.URLEncoder;

import java.nio.charset.Charset;

import java.util.ArrayList;

import java.util.HashMap;

import java.util.Iterator;

import java.util.List;

import java.util.Map;

import java.util.Map.Entry;

import java.util.StringTokenizer;

import net.sf.json.JSONArray;

import net.sf.json.JSONObject;

import org.jsoup.Connection;

import org.jsoup.Connection.Method;

import org.jsoup.Jsoup;

import org.jsoup.nodes.Document;

import org.jsoup.nodes.Element;

import org.jsoup.select.Elements;

public class CookieUtil {

public final static String CONTENT_TYPE = "Content-Type";

public static void main(String[] args) {

//String loginURL = "http://p2pHDNNWvTBSeye.com/member.php?mod=logging&action=login&loginsubmit=yes&loginhash=Lsc66&username=puqiuxiaomao&password=a1234567";

String listURL = "http://p2peye.com/blacklist.php?p=2";

String logURL = "http://p2peye.com/member.php";

//********************************需要登录的*************************************************

try {

Connection.Response res =

Jsoup.connect(logURL)

.data("mod","logging"

,"action","login"

,"loginsubmit","yes"

,"loginhash","Lsc66"

,"username","puqiuxiaomao"

,"password","a1234567")

.method(Method.POST)

.execute();

//这儿的SESSIONID需要根据要登录的目标网站设置的session Cookie名字而定

Connection con=Jsoup.connect(listURL);

//设置访问形式(电脑访问,手机访问):直接百度都参数设置

con.header("User-Agent", "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)");

//把登录信息的cookies保存如map对象里面

Map map=res.cookies();

Iterator> it =map.entrySet().iterator();

while(it.hasNext()){

Entry en= it.next();

//把登录的信息放入请求里面

con =con.cookie(en.getKey(), en.getValue());

}

//再次获取Document对象。

Document objectDoc = con.get();

Elements elements = objectDoc.getAllElements();//获取这个连接返回页面的源码内容(不是源码跟源码差不多)

for (Element element : elements) {

//element是迭代出来的标签:如:

Elements elements2= element.getAllElements();//

for (Element element2 : elements2) {

element2.text();

element2.attr("href");//获取标签属性。element2代表a标签:href代表属性

element2.text();//获取标签文本

}

}

//********************************不需要登录的*************************************************

String URL = "http://p2peye.com/blacklist.php?p=2";

Document conTemp = Jsoup.connect(URL).get();

Elements elementsTemps = conTemp.getAllElements();

for (Element elementsTemp : elementsTemps) {

elementsTemp.text();

elementsTemp.attr("href");//获取标签属性。element2代表a标签:href代表属性

elementsTemp.text();//获取标签文本

}

//********************************ajax方法获取内容。。。*************************************************。

HttpURLConnection connection = null;

BufferedReader reader = null;

try {

StringBuffer sb = new StringBuffer();

URL getUrl = new URL(URL);

connection = (HttpURLConnection)getUrl.openConnection();

reader = new BufferedReader(new InputStreamReader(

connection.getInputStream(),"utf-8"));

String lines;

while ((lines = reader.readLine()) != null) {

sb.append(lines);

};

List> list = parseJSON2List(sb.toString());//json转换成list

} catch (Exception e) {

} finally{

if(reader!=null)

try {

reader.close();

} catch (IOException e) {

}

// 断开连接

connection.disconnect();

}

} catch (IOException e) {

// TODO Auto-generated catch block

e.printStackTrace();

}

}

public static Map parseJSON2Map(String jsonStr){

Map map = new HashMap();

//最外层解析

JSONObject json = JSONObject.fromObject(jsonStr);

for(Object k : json.keySet()){

Object v = json.get(k);

//如果内层还是数组的话,继续解析

if(v instanceof JSONArray){

List> list = new ArrayList>();

Iterator it = ((JSONArray)v).iterator();

while(it.hasNext()){

JSONObject json2 = it.next();

list.add(parseJSON2Map(json2.toString()));

}

map.put(k.toString(), list);

} else {

map.put(k.toString(), v);

}

}

return map;

}

public static List> parseJSON2List(String jsonStr){

JSONArray jsonArr = JSONArray.fromObject(jsonStr);

List> list = new ArrayList>();

Iterator it = jsonArr.iterator();

while(it.hasNext()){

JSONObject json2 = it.next();

list.add(parseJSON2Map(json2.toString()));

}

return list;

}

}

二,这个是获取验证码的类,可以研究下(但你需要先分析出网站验证码的请求地址)。

package com.minxinloan.black.web.utils;

import java.io.BufferedReader;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.io.PrintWriter;
import java.net.HttpURLConnection;
import java.net.URL;
import java.net.URLConnection;
import java.net.URLEncoder;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.StringTokenizer;

/**
 * HTTP helpers used to download a captcha image (and the cookies sent with it),
 * to read a page as text and to dump text into a file.
 */
public class Utils {

    /** Encoding assumed whenever a response does not declare a usable charset. */
    private static final String DEFAULT_ENCODING = "UTF-8";

    private static final String CONTENT_TYPE_HEADER = "Content-Type";

    private static final int TIMEOUT_MILLIS = 5000;

    /**
     * Requests {@code sUrl} and returns the response headers together with the decoded body.
     * Unless {@code isOnlyReturnHeader} is set, the raw response bytes are also saved as
     * {@code code.bmp} inside {@code path} (the captcha image).
     *
     * @param method             "GET" or "POST" in any case; anything else, including
     *                           {@code null}, is treated as "POST"
     * @param sUrl               URL to request
     * @param paramMap           request parameters (e.g. user name and password); sent as a
     *                           form body for POST and appended to the query string for GET;
     *                           may be {@code null}
     * @param requestHeaderMap   extra request headers (e.g. {@code Cookie}); may be {@code null}
     * @param isOnlyReturnHeader when {@code true} the body is neither read nor written to disk
     * @param path               directory that receives {@code code.bmp}; when {@code null}
     *                           nothing is written
     * @return the response, or {@code null} when the request fails or the status is neither
     *         200 nor 201
     */
    public static Content getRandom(String method, String sUrl,
            Map<?, ?> paramMap,
            Map<?, ?> requestHeaderMap,
            boolean isOnlyReturnHeader, String path) {
        HttpURLConnection httpUrlConnection = null;
        try {
            // Everything that is not GET falls back to POST. Deciding this before the
            // connection is configured also normalises lower-case input such as "get",
            // which setRequestMethod would otherwise reject.
            boolean isPost = !"GET".equalsIgnoreCase(method);
            String query = encodeParams(paramMap);

            String target = sUrl;
            if (!isPost && !query.isEmpty()) {
                target = sUrl + (sUrl.indexOf('?') >= 0 ? '&' : '?') + query;
            }

            httpUrlConnection = (HttpURLConnection) new URL(target).openConnection();
            httpUrlConnection.setRequestMethod(isPost ? "POST" : "GET");
            httpUrlConnection.setRequestProperty("Accept-Language", "zh-cn,zh;q=0.5");
            if (requestHeaderMap != null) {
                for (Map.Entry<?, ?> header : requestHeaderMap.entrySet()) {
                    if (header.getKey() != null && header.getValue() != null) {
                        httpUrlConnection.setRequestProperty(
                                String.valueOf(header.getKey()),
                                String.valueOf(header.getValue()));
                    }
                }
            }
            // Do not follow redirects; the caller inspects the returned headers itself.
            httpUrlConnection.setInstanceFollowRedirects(false);
            httpUrlConnection.setDoInput(true);
            httpUrlConnection.setConnectTimeout(TIMEOUT_MILLIS);
            httpUrlConnection.setReadTimeout(TIMEOUT_MILLIS);
            httpUrlConnection.setUseCaches(false);
            // NOTE(review): kept from the original; per the JDK documentation this changes
            // the default for connections created later, not just this one.
            httpUrlConnection.setDefaultUseCaches(false);

            if (isPost && !query.isEmpty()) {
                httpUrlConnection.setDoOutput(true);
                if (httpUrlConnection.getRequestProperty(CONTENT_TYPE_HEADER) == null) {
                    httpUrlConnection.setRequestProperty(CONTENT_TYPE_HEADER,
                            "application/x-www-form-urlencoded");
                }
                try (OutputStream form = httpUrlConnection.getOutputStream()) {
                    form.write(query.getBytes(StandardCharsets.UTF_8));
                }
            }

            int responseCode = httpUrlConnection.getResponseCode();
            if (responseCode != HttpURLConnection.HTTP_OK
                    && responseCode != HttpURLConnection.HTTP_CREATED) {
                return null;
            }

            byte[] bytes = new byte[0];
            if (!isOnlyReturnHeader) {
                try (InputStream in = httpUrlConnection.getInputStream()) {
                    bytes = readFully(in);
                }
                if (path != null) {
                    // Location of the captcha image.
                    try (OutputStream image = new FileOutputStream(new File(path, "code.bmp"))) {
                        image.write(bytes);
                    }
                }
            }

            String encoding = getEncodingFromContentType(
                    httpUrlConnection.getHeaderField(CONTENT_TYPE_HEADER));
            return new Content(sUrl, new String(bytes, encoding),
                    httpUrlConnection.getHeaderFields());
        } catch (Exception e) {
            // Callers rely on null meaning "request failed", whatever the cause.
            e.printStackTrace();
            return null;
        } finally {
            if (httpUrlConnection != null) {
                httpUrlConnection.disconnect();
            }
        }
    }

    /** Builds a URL-encoded {@code k=v&k=v} string; returns "" for a null or empty map. */
    private static String encodeParams(Map<?, ?> params) throws IOException {
        if (params == null || params.isEmpty()) {
            return "";
        }
        StringBuilder query = new StringBuilder();
        for (Map.Entry<?, ?> param : params.entrySet()) {
            if (param.getKey() == null) {
                continue;
            }
            if (query.length() > 0) {
                query.append('&');
            }
            Object value = param.getValue();
            query.append(URLEncoder.encode(String.valueOf(param.getKey()), DEFAULT_ENCODING))
                    .append('=')
                    .append(URLEncoder.encode(value == null ? "" : String.valueOf(value),
                            DEFAULT_ENCODING));
        }
        return query.toString();
    }

    /** Reads the stream until end of input and returns everything that was read. */
    private static byte[] readFully(InputStream in) throws IOException {
        ByteArrayOutputStream collected = new ByteArrayOutputStream();
        byte[] buffer = new byte[4096];
        int count;
        while ((count = in.read(buffer)) != -1) {
            collected.write(buffer, 0, count);
        }
        return collected.toByteArray();
    }

    /**
     * Extracts the charset from a {@code Content-Type} header value.
     *
     * @param contentType header value such as {@code text/html; charset=utf-8}; may be null
     * @return the declared charset exactly as written when this JVM supports it, otherwise
     *         "UTF-8" (also for a null header, a missing or an illegal charset name)
     */
    public static String getEncodingFromContentType(String contentType) {
        if (contentType == null) {
            return DEFAULT_ENCODING;
        }
        String encoding = null;
        StringTokenizer tok = new StringTokenizer(contentType, ";");
        if (tok.hasMoreTokens()) {
            tok.nextToken(); // the first token is the media type, not a parameter
            while (tok.hasMoreTokens()) {
                String assignment = tok.nextToken().trim();
                int eqIdx = assignment.indexOf('=');
                if (eqIdx == -1) {
                    continue;
                }
                String varName = assignment.substring(0, eqIdx).trim();
                if (!"charset".equalsIgnoreCase(varName)) {
                    continue;
                }
                String varValue = assignment.substring(eqIdx + 1).trim();
                // Strip surrounding quotes; the length check keeps a lone '"' from
                // producing an invalid substring range.
                if (varValue.length() >= 2 && varValue.startsWith("\"")
                        && varValue.endsWith("\"")) {
                    varValue = varValue.substring(1, varValue.length() - 1);
                }
                if (isSupportedCharset(varValue)) {
                    encoding = varValue;
                }
            }
        }
        return encoding == null ? DEFAULT_ENCODING : encoding;
    }

    /** Like {@link Charset#isSupported(String)} but answers false for illegal names. */
    private static boolean isSupportedCharset(String name) {
        try {
            return Charset.isSupported(name);
        } catch (IllegalArgumentException e) {
            // IllegalCharsetNameException: the header carried a syntactically invalid name.
            return false;
        }
    }

    /**
     * Writes {@code content} to the file at {@code path}, replacing any previous content.
     * The platform default charset is used.
     *
     * @param content text to write
     * @param path    target file
     * @return {@code true} when the text was written, {@code false} when an argument is
     *         null or the file could not be written
     */
    public static boolean inFile(String content, String path) {
        if (content == null || path == null) {
            return false;
        }
        // FileWriter creates the file when it is missing and, unlike PrintWriter,
        // reports write failures as exceptions.
        try (FileWriter out = new FileWriter(new File(path))) {
            out.write(content);
            out.flush();
            return true;
        } catch (IOException e) {
            e.printStackTrace();
            return false;
        }
    }

    /**
     * Downloads {@code httpurl}, prints the HTTP status code to standard output and returns
     * the body decoded as UTF-8 with every line terminated by {@code '\n'}.
     *
     * @param httpurl URL to request
     * @return the page text, or "" when the request fails
     */
    public static String getHtmlReadLine(String httpurl) {
        HttpURLConnection connection = null;
        try {
            connection = (HttpURLConnection) new URL(httpurl).openConnection();
            connection.connect();
            System.out.println(connection.getResponseCode());

            StringBuilder html = new StringBuilder();
            try (BufferedReader reader = new BufferedReader(
                    new InputStreamReader(connection.getInputStream(), StandardCharsets.UTF_8))) {
                String line;
                while ((line = reader.readLine()) != null) {
                    html.append(line).append('\n');
                }
            }
            return html.toString();
        } catch (Exception e) {
            // Callers rely on "" meaning "nothing could be read".
            e.printStackTrace();
            return "";
        } finally {
            if (connection != null) {
                connection.disconnect();
            }
        }
    }
}

/** Immutable holder for one HTTP response: requested URL, decoded body and headers. */
class Content {

    private final String url;

    private final String body;

    private final Map<String, List<String>> headers;

    /**
     * @param url     URL that was requested
     * @param body    decoded response body
     * @param headers response headers; {@code null} is stored as an empty map
     */
    public Content(String url, String body, Map<String, List<String>> headers) {
        this.url = url;
        this.body = body;
        this.headers = headers != null ? headers : new HashMap<String, List<String>>();
    }

    public String getUrl() {
        return url;
    }

    public String getBody() {
        return body;
    }

    public Map<String, List<String>> getHeaders() {
        return headers;
    }
}

版权声明:本文内容由网络用户投稿,版权归原作者所有,本站不拥有其著作权,亦不承担相应法律责任。如果您发现本站中有涉嫌抄袭或描述失实的内容,请联系我们jiasou666@gmail.com 处理,核实后本网站将在24小时内删除侵权内容。

上一篇:java如何使用自己的maven本地仓库详解
下一篇:java socket实现聊天室 java实现多人聊天功能
相关文章

 发表评论

暂时没有评论,来抢沙发吧~