龙空技术网

微博扒取网络数据,针对上一条发文视频附上源码供初学者借鉴

码农唐小生 39

前言:

如今同学们对“php微博发布”大都比较关切,大家都想要了解一些“php微博发布”的相关内容。那么小编也在网络上搜集了一些关于“php微博发布”的相关内容,希望兄弟们能喜欢,我们快快来学习一下吧!

String strURL=";;// 5723344072

URL url=null;

HttpURLConnection httpConn=null;

url = new URL(strURL);

httpConn = (HttpURLConnection) url.openConnection();

// Key point 1: Weibo relies on cookies. A request sent without one gets back a nearly
// empty page with no real content, nothing like what the browser's F12 developer tools show.
//
// NOTE(review): this is a hard-coded session cookie; presumably it was copied from a
// browser session and will stop working once it expires — replace it with a current value
// before running, and keep real session cookies out of shared source.
String c="SINAGLOBAL=2525797642447.1143.1576751690811; UOR=,,login.sina.com.cn; TC-V5-G0=595b7637c272b28fccec3e9d529f251a; SSOLoginState=1585210218; Ugrow-G0=7e0e6b57abe2c2f76f677abd9a9ed65d; wvr=6; _s_tentry=weibo.com; Apache=7211436044072.67.1585211180994; ULV=1585211181930:3:2:1:7211436044072.67.1585211180994:1583116766636; SUB=_2AkMp3ULYf8PxqwJRmfkcyG7la4R0ygjEieKfgbMDJRMxHRl-yT9jqk8GtRB6Al1sKDCUM-bsv44hS2JWofGDBG0WLLhQ; SUBP=0033WrSXqPxfM72-Ws9jqgMF55529P9D9W5gfVwXwLLzATj6ArcV1q7i; TC-Page-G0=62b98c0fc3e291bc0c7511933c1b13ad|1585565168|1585565167";

httpConn.setRequestProperty("Cookie", c);

// "charset" is not an HTTP request header. Accept-Charset is the header a client uses to
// state which response character encodings it accepts.
httpConn.setRequestProperty("Accept-Charset", "utf-8");

// Accumulates the complete response body.
StringBuilder contentBuf = new StringBuilder();

// Requesting the input stream sends the request. try-with-resources guarantees the reader
// (and the response stream it wraps) is closed even if reading fails part-way.
try (BufferedReader bufReader = new BufferedReader(
        new InputStreamReader(httpConn.getInputStream(), "utf-8"))) {
    String line;
    // readLine() strips line terminators and none are re-inserted, so the page ends up as
    // one long line.
    while ((line = bufReader.readLine()) != null) {
        contentBuf.append(line);
    }
}

String buf = contentBuf.toString();

// Dump the raw page for inspection.
System.out.println(buf);

// Parse the downloaded markup so the <script> elements can be selected from it.
Document document = Jsoup.parse(buf);

// Key point 2: (per the original author, Weibo's pages are generated with PHP and) the
// visible content is rendered by <script> blocks, each carrying an ordinary JSON payload.
// The page is not rendered by a single script but by many of them, so to extract a given
// piece of content we first have to find the specific script that renders it.
Elements elements = document.select("script");

for (Element element : elements) {

    // Script payload; anything from a "<script>FM.view" marker onwards is discarded.
    String s1 = element.data().split("<script>FM.view")[0];

    // parts[0]: the JSON fields that precede "html" (this is where the module id appears).
    // parts[1]: the escaped HTML fragment that follows the "html":" marker.
    String[] parts = s1.split("\"html\":\"");
    if (parts.length < 2) {
        // No "html" field, or nothing after it: there is no fragment to parse.
        continue;
    }

    String moduleInfo = parts[0];
    boolean isProfileHeader = moduleInfo.contains("Pl_Official_Headerv6__1");
    boolean isCounterColumn = moduleInfo.contains("Pl_Core_T8CustomTriColumn__3");
    if (!isProfileHeader && !isCounterColumn) {
        continue;
    }

    // Peel the layers off one by one: drop the literal \t, \n and \r escape sequences,
    // then turn \" back into " and \/ back into /.
    String content = parts[1]
            .replaceAll("(\\\\t|\\\\n|\\\\r)", "")
            .replaceAll("\\\\\"", "\"")
            .replaceAll("\\\\/", "/");

    // Cut the last 13 characters when the string is longer than that.
    // NOTE(review): presumably this removes the closing tail of the script payload — confirm.
    content = content.substring(0, content.length() <= 13 ? content.length() : content.length() - 13);

    Document fragment = Jsoup.parse(content);

    if (isProfileHeader) {
        // Profile header module: nickname text and the "src" of the avatar element.
        String nickName = fragment.getElementsByClass("username").text();
        String img_url = fragment.getElementsByClass("photo").attr("src");

        // w.setNickname(nickName);
        // w.setImg_url(img_url);

        System.out.println(nickName);
        System.out.println(img_url);
    }

    if (isCounterColumn) {
        // The counters carry one of several font-size classes; try each in turn.
        Elements data = fragment.getElementsByClass("W_f14");
        if (data.size() == 0) {
            data = fragment.getElementsByClass("W_f16");
        }
        if (data.size() == 0) {
            data = fragment.getElementsByClass("W_f18");
        }

        // The second counter is the one read as the fan count; guard the index so that a
        // page with fewer than two counters is reported instead of throwing.
        if (data.size() > 1) {
            String fun = data.get(1).text();

            // w.setFan_num(fun);

            System.out.println(fun);
        } else {
            System.err.println("fan count not found: expected at least 2 counter elements, got " + data.size());
        }
    }
}

/*//System.out.println(elements);

System.out.println(elements.size());

String PCD_header=elements.get(10).data().split("<script>FM.view")[0];

if (PCD_header.contains("\"html\":\"")) {

String content = PCD_header.split("\"html\":\"")[1].replaceAll("(\\\\t|\\\\n|\\\\r)", "").replaceAll("\\\\\"", "\"").replaceAll("\\\\/", "/");

content = content.substring(0,content.length() <= 13 ? content.length(): content.length() - 13);

Document header = Jsoup.parse(content);

Elements headerphoto= header.getElementsByClass("photo");

Elements username= header.getElementsByClass("username");

String nickName=username.text();

String img_url=headerphoto.attr("src");

System.out.println(nickName);

System.out.println(img_url);

}

//关注 粉丝 发布微博数

String Column__3=elements.get(13).data().split("<script>FM.view")[0];

//System.out.println(Column__3);

if (Column__3.contains("\"html\":\"")) {

String content = Column__3.split("\"html\":\"")[1].replaceAll("(\\\\t|\\\\n|\\\\r)", "").replaceAll("\\\\\"", "\"").replaceAll("\\\\/", "/");

content = content.substring(0,content.length() <= 13 ? content.length(): content.length() - 13);

Document header = Jsoup.parse(content);

Elements data= header.getElementsByClass("W_f14");

//String guanzhu=data.get(0).text();

String fun=data.get(1).text();

//String weibo=data.get(2).text();

//System.out.println(guanzhu);

System.out.println(fun);

// System.out.println(weibo);

} */

/*System.out.println("--------------------------------------------------------");

//时间 转发 评论 赞 带U的微博链接32 不带的31

String Pl_Official_MyProfileFeed__21=elements.get(32).data().split("<script>FM.view")[0];

// System.out.println(Pl_Official_MyProfileFeed__21);

if (Pl_Official_MyProfileFeed__21.contains("\"html\":\"")) {

String content = Pl_Official_MyProfileFeed__21.split("\"html\":\"")[1].replaceAll("(\\\\t|\\\\n|\\\\r)", "").replaceAll("\\\\\"", "\"").replaceAll("\\\\/", "/");

content = content.substring(0,content.length() <= 13 ? content.length(): content.length() - 13);

Document header = Jsoup.parse(content);

// System.out.println(header);

Elements data= header.getElementsByClass("WB_feed_like");

for (Element element : data) {

// element.text();S_txt2

Elements WB_from=element.getElementsByClass("WB_from");

String time=WB_from.get(0).getElementsByTag("a").get(0).text();

String href=WB_from.get(0).getElementsByTag("a").get(0).attr("href");

System.out.println(time);

System.out.println(href);

Elements WB_row_line=element.getElementsByClass("WB_row_line");

Elements S_line1=WB_row_line.get(0).getElementsByClass("S_line1");

String zhuanfa=S_line1.get(1).getElementsByTag("em").get(1).text();

if(zhuanfa.equals("转发")) {

zhuanfa="0";

}

System.out.println(zhuanfa);

String pinglun=S_line1.get(3).getElementsByTag("em").get(1).text();

System.out.println(pinglun);

String zan=S_line1.get(5).getElementsByTag("em").get(1).text();

System.out.println(zan);

}

} */

}

标签: #php微博发布 #微博源码php