時間:2023-03-13 22:54:01 | 來源:電子商務
時間:2023-03-13 22:54:01 來源:電子商務
今天突然看到人人網的一個鏈接,想起前幾年可是很火的大學生社交網站,登錄自己的賬號進去看后,發現這網站居然開始搞直播,不是之前那么單純的社交功能網站了,不過發現人人網的數據還挺有意思的,可以爬取到不認識的人的社交資料,比如性別生日,所讀學校,好友列表的,這個數據可以拿來做一個社交關系網狀圖,甚至拿來檢驗六度分隔理論,想想還有點小興奮,于是就開始研究怎么爬取人人網的數據。
package http;

import org.apache.http.HttpEntity;
import org.apache.http.HttpResponse;
import org.apache.http.NameValuePair;
import org.apache.http.client.HttpClient;
import org.apache.http.client.entity.UrlEncodedFormEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.client.methods.HttpUriRequest;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.message.BasicNameValuePair;
import org.apache.http.util.EntityUtils;
import util.RegexUtil;

import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.util.*;

/**
 * Minimal HTTP helper built on Apache HttpClient that issues GET and POST
 * requests and returns the response body as a UTF-8 string.
 *
 * <p>Both request methods are best-effort: any failure is reported on standard
 * output and signalled to the caller by a {@code null} return value.
 *
 * <p>Created on 2017/12/23.
 */
public class HttpUtil {

    /**
     * Performs an HTTP GET.
     *
     * @param url     the address to fetch
     * @param headers request headers to send; {@code null} is treated as "no headers"
     * @return the response body decoded as UTF-8, or {@code null} when the request
     *         fails or the response carries no entity
     */
    public String doGet(String url, Map<String, String> headers) {
        try {
            // The request is built inside the try so that a malformed URL is
            // reported the same way as a network failure.
            return execute(new HttpGet(url), headers);
        } catch (Exception ex) {
            System.out.println("http get失敗: " + ex);
            return null;
        }
    }

    /**
     * Performs an HTTP POST with a URL-encoded form body.
     *
     * @param url     the address to post to
     * @param params  form fields; {@code null} or empty sends the request without a body
     * @param headers request headers to send; {@code null} is treated as "no headers"
     * @return the response body decoded as UTF-8, or {@code null} when the request
     *         fails or the response carries no entity
     */
    public String doPost(String url, Map<String, String> params, Map<String, String> headers) {
        try {
            HttpPost httpPost = new HttpPost(url);
            if (params != null && !params.isEmpty()) {
                List<NameValuePair> form = new ArrayList<NameValuePair>(params.size());
                for (Map.Entry<String, String> field : params.entrySet()) {
                    form.add(new BasicNameValuePair(field.getKey(), field.getValue()));
                }
                httpPost.setEntity(new UrlEncodedFormEntity(form, StandardCharsets.UTF_8));
            }
            return execute(httpPost, headers);
        } catch (Exception ex) {
            System.out.println("http post失敗: " + ex);
            return null;
        }
    }

    /** Adds the headers to the request, executes it and returns the body (or null if there is none). */
    private String execute(HttpUriRequest request, Map<String, String> headers) throws IOException {
        if (headers != null) {
            for (Map.Entry<String, String> header : headers.entrySet()) {
                request.addHeader(header.getKey(), header.getValue());
            }
        }
        // Client and response are both closed here so connections are not leaked.
        try (CloseableHttpClient client = HttpClients.createDefault();
             CloseableHttpResponse response = client.execute(request)) {
            HttpEntity entity = response.getEntity();
            return entity == null ? null : EntityUtils.toString(entity, StandardCharsets.UTF_8);
        }
    }

    /** Demo entry point: fetches one profile page and prints the parsed header-bar fields. */
    public static void main(String[] args) {
        HttpUtil util = new HttpUtil();
        Map<String, String> headers = new HashMap<String, String>();
        headers.put("Cookie", "你的cookie串");
        String htmlStr = util.doGet("http://www.renren.com/494871890/profile", headers);
        Map<String, String> userInfo = RegexUtil.groupUserInfo(htmlStr);
        System.out.println(userInfo);
    }
}
package util;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Regular-expression helpers for pulling fields out of a profile page's HTML.
 *
 * <p>Created on 2017/12/24.
 */
public class RegexUtil {

    /** Captures everything from the opening "tl-information" div to the last closing div tag. */
    private static final Pattern INFO_DIV =
            Pattern.compile("(<div class=\"tl-information\"[\\s\\S]+</div>)");

    /** Captures the class attribute (group 1) and inner markup (group 2) of each list item. */
    private static final Pattern INFO_ITEM =
            Pattern.compile("<li class=\"(.*?)\">(.*?)</li>");

    /**
     * Finds the first match of {@code reg} in {@code str} and returns its capture groups.
     *
     * @param str      text to search; {@code null} yields {@code null}
     * @param reg      regular expression to apply
     * @param valueNum number of capture groups the expression is expected to declare
     * @return groups 1..{@code valueNum} of the first match, or {@code null} when
     *         there is no match or the expression declares a different number of groups
     */
    public static List<String> getValByReg(String str, String reg, int valueNum) {
        if (str == null) {
            return null;
        }
        Matcher m = Pattern.compile(reg).matcher(str);
        if (!m.find() || m.groupCount() != valueNum) {
            return null;
        }
        List<String> rets = new ArrayList<String>();
        for (int i = 1; i <= valueNum; i++) {
            rets.add(m.group(i));
        }
        return rets;
    }

    /**
     * Extracts the header-bar fields of a user's profile page.
     *
     * @param htmlStr the page HTML; {@code null} yields an empty map
     * @return a map from each list item's class attribute to its text with span
     *         tags and spaces removed; empty when the information div is absent
     */
    public static Map<String, String> groupUserInfo(String htmlStr) {
        Map<String, String> retMap = new HashMap<String, String>();
        if (htmlStr == null) {
            return retMap;
        }
        // Locate the div that holds the user information.
        Matcher divMatcher = INFO_DIV.matcher(htmlStr);
        if (!divMatcher.find()) {
            return retMap;
        }
        // Newlines are stripped because '.' in INFO_ITEM does not cross line breaks.
        String div = divMatcher.group(1).replace("\n", "");
        Matcher item = INFO_ITEM.matcher(div);
        while (item.find()) {
            String attr = item.group(1);
            String value = item.group(2).replaceAll("(<span>|</span>| )", "");
            retMap.put(attr, value);
        }
        return retMap;
    }
}
于是運行上面httpUtil中的主類方法我們可以輸出的結果就是{birthday=女生,二月十二日, hometown=來自河南商丘市, address=現居重慶, school=就讀于重慶郵電大學}
這就是說我們已經有了一個可以根據id拿到用戶主頁標題欄信息的工具了
package renren;

import java.util.Set;

/**
 * Bean holding the data collected for one Renren user.
 *
 * <p>Created on 2017/12/24.
 */
public class User {
    /** Renren id. */
    private String renrenId;
    /** Renren nickname. */
    private String renrenName;
    /** Header-bar information of the profile page. */
    private String renrenInfo;
    /** Number of times this user's home page has been visited. */
    private int visitTime;
    /** Renren ids of all of this user's friends. */
    private Set<String> allFriendsId;

    /** Creates an empty user; populate it through the setters. */
    public User() {
    }

    /**
     * Creates a fully populated user.
     *
     * @param renrenId     Renren id
     * @param renrenName   nickname
     * @param renrenInfo   header-bar information
     * @param visitTime    home-page visit count
     * @param allFriendsId ids of the user's friends; stored as given, not copied
     */
    public User(String renrenId, String renrenName, String renrenInfo, int visitTime,
                Set<String> allFriendsId) {
        this.renrenId = renrenId;
        this.renrenName = renrenName;
        this.renrenInfo = renrenInfo;
        this.visitTime = visitTime;
        this.allFriendsId = allFriendsId;
    }

    public String getRenrenId() {
        return renrenId;
    }

    public void setRenrenId(String renrenId) {
        this.renrenId = renrenId;
    }

    public String getRenrenInfo() {
        return renrenInfo;
    }

    public void setRenrenInfo(String renrenInfo) {
        this.renrenInfo = renrenInfo;
    }

    public int getVisitTime() {
        return visitTime;
    }

    public void setVisitTime(int visitTime) {
        this.visitTime = visitTime;
    }

    public String getRenrenName() {
        return renrenName;
    }

    public void setRenrenName(String renrenName) {
        this.renrenName = renrenName;
    }

    public Set<String> getAllFriendsId() {
        return allFriendsId;
    }

    public void setAllFriendsId(Set<String> allFriendsId) {
        this.allFriendsId = allFriendsId;
    }

    /** Renders id, nickname, info and visit count joined by "---"; the friend set is omitted. */
    @Override
    public String toString() {
        return String.format("%s---%s---%s---%d", renrenId, renrenName, renrenInfo, visitTime);
    }
}
可以看到我們在bean中還定義了 用戶昵稱,用戶主頁面訪問次數,好友id集合的字段,好友id集合獲取方式會在后面介紹,有點小復雜
package util;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Additional profile-page extractors. This listing shows only the methods being
 * added; both rely on the class's existing {@code getValByReg} helper.
 *
 * <p>Created on 2017/12/24.
 */
public class RegexUtil {

    /**
     * Finds the visit counter inside the footprint box. DOTALL lets '.' cross line
     * breaks, and the greedy prefix anchors on the last occurrence of the label.
     */
    private static final Pattern VISIT_COUNT =
            Pattern.compile(".*最近來訪.*?(\\d+)", Pattern.DOTALL);

    /**
     * Extracts the user's nickname from the page title.
     *
     * @param htmlStr the page HTML; {@code null} yields {@code null}
     * @return the text after the site-name prefix in the title, or {@code null}
     *         when the title does not match
     */
    public static String groupUsername(String htmlStr) {
        if (htmlStr == null) {
            return null;
        }
        // NOTE(review): the site name is written here in traditional characters;
        // the live page may use the simplified form - confirm against real HTML.
        String nameReg = "<title>人人網 - (.*)</title>";
        List<String> rets = RegexUtil.getValByReg(htmlStr, nameReg, 1);
        if (rets != null && rets.size() > 0) {
            return rets.get(0);
        }
        return null;
    }

    /**
     * Extracts the home-page visit count.
     *
     * @param htmlStr the page HTML; {@code null} yields 0
     * @return the first number following the "recent visitors" label inside the
     *         footprint box, or 0 when it is missing or does not fit in an int
     */
    public static int groupVisitTime(String htmlStr) {
        if (htmlStr == null) {
            return 0;
        }
        String visitTimeReg = "(<div id=\"footprint-box\"[\\s\\S]*/h5>)";
        List<String> rets = RegexUtil.getValByReg(htmlStr, visitTimeReg, 1);
        if (rets == null || rets.isEmpty()) {
            return 0;
        }
        Matcher counter = VISIT_COUNT.matcher(rets.get(0));
        if (!counter.find()) {
            return 0;
        }
        try {
            return Integer.parseInt(counter.group(1));
        } catch (NumberFormatException tooLarge) {
            // A digit run longer than an int can hold is treated as "unknown".
            return 0;
        }
    }
}
現在我們能拿到的信息有 用戶id,用戶昵稱,用戶標題欄信息(地址,大學等),主頁訪問次數 , 編寫一個主類來試一下
package http;

import net.sf.json.JSONObject;
import org.apache.http.HttpEntity;
import org.apache.http.HttpResponse;
import org.apache.http.NameValuePair;
import org.apache.http.client.HttpClient;
import org.apache.http.client.entity.UrlEncodedFormEntity;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.message.BasicNameValuePair;
import org.apache.http.util.EntityUtils;
import renren.User;
import util.RegexUtil;

import java.util.*;

/**
 * Demo driver: downloads one profile page, parses it and prints the resulting
 * {@link User}.
 *
 * <p>Created on 2017/12/23.
 */
public class Test {

    /**
     * Fetches the profile of a hard-coded id and prints the populated bean.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        HttpUtil util = new HttpUtil();
        Map<String, String> headers = new HashMap<String, String>();
        // Session cookie of a logged-in account.
        headers.put("Cookie", "你的cookie");
        String renrenId = "494871890";
        String htmlStr = util.doGet("http://www.renren.com/" + renrenId + "/profile", headers);
        if (htmlStr == null) {
            // doGet signals failure with null; parsing it would only raise an NPE.
            System.out.println("profile page could not be fetched for id " + renrenId);
            return;
        }

        // Header-bar fields, serialised to JSON for storage in the bean.
        Map<String, String> userInfo = RegexUtil.groupUserInfo(htmlStr);
        JSONObject jsonObj = JSONObject.fromObject(userInfo);
        String renrenInfo = jsonObj.toString();

        // Nickname.
        String userName = RegexUtil.groupUsername(htmlStr);

        // Home-page visit count.
        int visitTime = RegexUtil.groupVisitTime(htmlStr);

        // The friend-id set is not collected yet, hence null.
        User user = new User(renrenId, userName, renrenInfo, visitTime, null);
        System.out.println(user);
    }
}
輸出結果為494871890---李瑤玉---{"birthday":"女生,二月十二日","hometown":"來自河南商丘市","address":"現居重慶","school":"就讀于重慶郵電大學"}---62
那么我們根據一個id獲取到用戶信息并且填入bean已經可以成功了,后續會講到。關鍵詞:數據,社交,爬蟲
客戶&案例
營銷資訊
關于我們
微信公眾號
版權所有 © 億企邦 1997-2025 保留一切法律許可權利。