网页抓取中避免多次登录

网页抓取中要尽量避免多次登录,理由是:

1.多次登录浪费资源

2.登录次数过多会导致账号禁用或被封

所以我们要把cookie信息存起来,下次使用只有登录才能使用的功能时,直接装入cookie信息就行了。

但是这样做有可能带来一个问题:有的网站对一次登录后可执行的操作次数有限制。遇到这种情况,写个脚本每隔一段时间清除cookie并重新登录就可以了,详细例子见上一篇文章《贴吧发帖机、盖楼机》。

百度贴吧发帖机、盖楼机程序(PHP版)

不说废话,翠花,上代码

 
[php]
<?php
/**
 * Minimal cURL-based HTTP client that carries a cookie string from one
 * request to the next and persists it in ./cookie.txt, so that a later run
 * can reuse the stored cookies instead of logging in again.
 *
 * Responses are returned with the HTTP headers included (CURLOPT_HEADER is
 * enabled), exactly as curl_exec() delivers them.
 */
class httpconnector {
    /** File (relative to the working directory) that stores the cookie string. */
    const COOKIE_FILE = './cookie.txt';

    /** User-Agent header sent with every request. */
    const USER_AGENT = "Mozilla/4.0 (compatible; MSIE 5.01; Windows NT 5.0)";

    /** cURL handle of the request currently being performed. */
    private $curl;

    /** Cookie header value: "name=value" pairs separated by semicolons. */
    private $cookie = '';

    /**
     * Loads a previously saved cookie string, if there is one.
     *
     * A missing or unreadable cookie file is not an error: the client simply
     * starts without cookies. Only the first line of the file is used.
     */
    function __construct() {
        if (!is_file(self::COOKIE_FILE) || !is_readable(self::COOKIE_FILE)) {
            return;
        }

        $raw = file_get_contents(self::COOKIE_FILE);
        if ($raw === false) {
            return;
        }

        $lines = preg_split('/\r\n|\r|\n/', $raw);
        $this->cookie = trim($lines[0]);
    }

    /**
     * Downloads a page with an HTTP GET request.
     *
     * @param string $url address to fetch
     * @return string|false response headers followed by the body, or false
     *                      when curl_exec() fails
     */
    public function get($url) {
        return $this->request($url, false, null);
    }

    /**
     * Downloads a page with an HTTP POST request.
     *
     * @param string $url    address to post to
     * @param mixed  $params POST payload, handed to CURLOPT_POSTFIELDS as is
     * @return string|false response headers followed by the body, or false
     *                      when curl_exec() fails
     */
    public function post($url, $params) {
        return $this->request($url, true, $params);
    }

    /**
     * Performs one request, then folds the cookies set by the response into
     * the stored cookie string and saves it.
     *
     * @param string $url    address to request
     * @param bool   $isPost true to send a POST, false for a GET
     * @param mixed  $params POST payload (ignored for GET)
     * @return string|false raw response including headers, or false on failure
     */
    private function request($url, $isPost, $params) {
        $this->curl = curl_init();

        curl_setopt($this->curl, CURLOPT_URL, $url);
        // Keep the response headers in the returned data: the cookies are read from them.
        curl_setopt($this->curl, CURLOPT_HEADER, 1);
        curl_setopt($this->curl, CURLOPT_USERAGENT, self::USER_AGENT);
        if ($this->cookie !== '') {
            curl_setopt($this->curl, CURLOPT_COOKIE, $this->cookie);
        }
        if ($isPost) {
            curl_setopt($this->curl, CURLOPT_POST, 1);
            curl_setopt($this->curl, CURLOPT_POSTFIELDS, $params);
        }
        // Return the response as a string instead of printing it.
        curl_setopt($this->curl, CURLOPT_RETURNTRANSFER, 1);

        $data = curl_exec($this->curl);
        $headerSize = ($data === false) ? 0 : (int) curl_getinfo($this->curl, CURLINFO_HEADER_SIZE);
        curl_close($this->curl);

        if ($data === false) {
            // Nothing was received, so the stored cookies stay untouched.
            return false;
        }

        // Only the header section is scanned, so body text that merely looks
        // like a Set-Cookie line cannot pollute the cookie string.
        $headers = (string) substr($data, 0, $headerSize);
        $this->cookie = self::mergeCookies($this->cookie, $headers);

        // file_put_contents() raises its own warning if the file cannot be written.
        file_put_contents(self::COOKIE_FILE, $this->cookie, LOCK_EX);

        return $data;
    }

    /**
     * Combines an existing cookie string with the cookies of a response.
     *
     * Every "Set-Cookie:" header contributes its leading name=value pair
     * (attributes after the first semicolon are dropped). A cookie that is
     * set again replaces the earlier value instead of being appended a second
     * time, so the string does not grow with every request.
     *
     * @param string $existing current cookie string ("a=1;b=2")
     * @param string $headers  raw response header section
     * @return string merged cookie string, pairs joined by "; "
     */
    private static function mergeCookies($existing, $headers) {
        $pairs = explode(';', (string) $existing);
        if (preg_match_all('/^Set-Cookie:[ \t]*([^;\r\n]+)/mi', (string) $headers, $found)) {
            $pairs = array_merge($pairs, $found[1]);
        }

        $jar = array();
        foreach ($pairs as $pair) {
            $pair = trim($pair);
            if ($pair === '') {
                continue;
            }
            $bits = explode('=', $pair, 2);
            // Keyed by cookie name: a later pair overwrites an earlier one.
            $jar[trim($bits[0])] = $pair;
        }

        return implode('; ', $jar);
    }
}
/**
 * Screen-scraping client for Baidu Tieba built on httpconnector: logs in,
 * creates threads, replies to threads, logs out and lists the threads of a
 * forum.
 *
 * Encoding notes kept from the original author: pages are crawled as GBK and
 * the forum name used for crawling must be GBK as well; replies and the login
 * request are sent as UTF-8.
 *
 * None of the methods URL-encode the values they are given; they are
 * concatenated into URLs and POST bodies as is, so callers must pass values
 * that are safe to embed.
 */
class tieba {
    /** HTTP client used for every request. */
    private $http;

    function __construct() {
        $this->http = new httpconnector;
    }

    /**
     * Posts the login form and classifies the response by the messages it contains.
     *
     * @param string $username account name
     * @param string $pwd      password
     * @return int 0 when the page reports a wrong password, 1 when it mentions
     *             a captcha, 2 when it reports an unknown account, 3 when none
     *             of these markers is present (presumably success - this is
     *             also the result when the request itself failed)
     */
    function login($username, $pwd) {
        $data = (string) $this->http->post(
            'http://wappass.baidu.com/passport/',
            "login_username=" . $username . "&login_loginpass=" . $pwd . "&login=yes&aa=登录&can_input=0"
        );

        // "!== false" rather than "> 0": a marker at offset 0 is still a match.
        if (strpos($data, "密码错误") !== false) {
            return 0;
        }
        if (strpos($data, "验证码") !== false) {
            return 1;
        }
        if (strpos($data, "账户不存在") !== false) {
            return 2;
        }
        return 3;
    }

    /**
     * Creates a new thread in a forum and tries to determine its id.
     *
     * The forum page is fetched, its posting form is scraped and submitted
     * with the title and content, then the forum page is fetched again and
     * the id of the first thread not marked with <span class="light"> is
     * returned. The original author warns that pinned threads can make this
     * id inaccurate.
     *
     * @param string $tiebaname forum name as it goes into the "kw" parameter
     * @param string $title     thread title
     * @param string $content   thread body
     * @return string|false thread id, or false when the posting form or the
     *                      new thread could not be found
     */
    function send($tiebaname, $title, $content) {
        $url = "http://wapp.baidu.com/f/?kw=" . $tiebaname;

        $form = self::extractPostForm($this->http->get($url));
        if ($form === false) {
            return false;
        }
        list($action, $postdata) = $form;

        // NOTE(review): the blank after "co=" is in the original code; confirm it is intended.
        $postdata .= "ti=" . $title . "&co= " . $content;
        $this->http->post("http://wapp.baidu.com/" . $action, $postdata);

        $listing = (string) $this->http->get($url);
        $entries = explode('<div class="i">', $listing);
        $total = count($entries);
        for ($i = 1; $i < $total; $i++) {
            if (strpos($entries[$i], '<span class="light">') !== false) {
                // Entries carrying the "light" span are skipped.
                continue;
            }

            $afterKz = explode("m?kz=", $entries[$i]);
            if (!isset($afterKz[1])) {
                return false;
            }
            $idAndRest = explode("&", $afterKz[1]);
            return $idAndRest[0];
        }

        return false;
    }

    /**
     * Logs out by following the link whose text is "退出" on the start page.
     *
     * @return string|false response of the logout request, or false when the
     *                      start page contains no such link
     */
    function logout() {
        $page = (string) $this->http->get("http://wapp.baidu.com");

        $aroundLink = explode("退出</a>", $page);
        if (count($aroundLink) < 2) {
            // No logout link: without this guard the last link of the whole page would be requested.
            return false;
        }

        // The logout URL is the last href that precedes the link text.
        $hrefs = explode('href="', $aroundLink[0]);
        if (count($hrefs) < 2) {
            return false;
        }
        $target = explode('"', $hrefs[count($hrefs) - 1]);

        return $this->http->get($target[0]);
    }

    /**
     * Posts a reply to a thread.
     *
     * The fetched thread page, the submit URL and the response of the submit
     * request are echoed, as in the original implementation.
     *
     * @param string $tid     thread id ("kz" parameter)
     * @param string $content reply text
     * @return void
     */
    function replay($tid, $content) {
        $url = "http://wapp.baidu.com/f/?kz=" . $tid;

        $data = $this->http->get($url);
        echo $data;

        $form = self::extractPostForm($data);
        if ($form === false) {
            return;
        }
        list($action, $postdata) = $form;

        $postdata .= "co=" . $content;
        $url = "http://wapp.baidu.com/" . $action;
        echo $url;
        $data = $this->http->post($url, $postdata);
        echo $data;
    }

    /**
     * Fetches the desktop listing of a forum and extracts one record per table row.
     *
     * The forum name is converted from UTF-8 to GBK before it is put into the
     * URL; the returned strings are taken from the page unchanged (GBK
     * according to the original author).
     *
     * @param string $tiebaname forum name in UTF-8
     * @return array list of arrays with the keys "clickcount", "replaynum",
     *               "title", "author", "tid" and "replay"
     */
    function crul($tiebaname) {
        $tiebaname = iconv("UTF-8", "GBK", $tiebaname);
        $url = "http://tieba.baidu.com/f?kw=" . $tiebaname;
        $content = (string) $this->http->get($url);

        $out = explode("</tr>", $content);
        // The first chunk and the trailing six chunks are not treated as thread rows.
        $num = count($out) - 7;
        $result = array();
        for ($i = 1; $i < $num + 1; $i++) {
            // Pad short rows so that every cell index used below exists.
            $tli = array_pad(explode("</td>", $out[$i]), 5, '');

            $pos1 = (int) strpos($tli[0], '<td nowrap>');
            $clickcount = substr($tli[0], $pos1 + 11);
            $replaynum = $this->getNum($tli[1]);
            $author = trim(strip_tags($tli[3]));
            $replay = $this->getChinese($tli[4]);

            $href = $tli[2];
            $pos1 = (int) strpos($href, '"_blank"');
            $pos2 = (int) strpos($href, '</a>');
            $title = strip_tags((string) substr($href, $pos1 + 9, $pos2 - $pos1 - 5));
            $pos1 = (int) strpos($href, '/p');
            $tid = substr($href, $pos1 + 3, 10);

            if ($replay == "") {
                $replay = $author;
            }
            $result[] = array(
                "clickcount" => $clickcount,
                "replaynum" => $replaynum,
                "title" => $title,
                "author" => $author,
                "tid" => $tid,
                "replay" => $replay,
            );
        }

        return $result;
    }

    /**
     * Keeps only the two-byte sequences whose first byte is greater than 0xA0.
     *
     * @param string $string byte string (GBK text in this class)
     * @return string the concatenated two-byte sequences
     */
    function getChinese($string) {
        $string = (string) $string;
        $kept = '';
        $length = strlen($string);
        for ($i = 0; $i < $length; $i++) {
            if (ord($string[$i]) > 0xA0) {
                $kept .= substr($string, $i, 2);
                $i++; // the second byte belongs to the same character
            }
        }
        return $kept;
    }

    /**
     * Keeps the two-byte sequences whose first byte is greater than 0xA0
     * together with ASCII digits, in their original order.
     *
     * @param string $string byte string (GBK text in this class)
     * @return string the filtered string
     */
    function getChineseNum($string) {
        $string = (string) $string;
        $kept = '';
        $length = strlen($string);
        for ($i = 0; $i < $length; $i++) {
            $byte = $string[$i];
            if (ord($byte) > 0xA0) {
                $kept .= substr($string, $i, 2);
                $i++; // the second byte belongs to the same character
            } elseif (is_numeric($byte)) {
                $kept .= $byte;
            }
        }
        return $kept;
    }

    /**
     * Strips everything except the ASCII digits 0-9.
     *
     * @param string $string input text
     * @return string the digits of the input, in order
     */
    function getNum($string) {
        return (string) preg_replace('/[^0-9]/', '', (string) $string);
    }

    /**
     * Scrapes the posting form that follows <div class="d h"> on a page.
     *
     * Extracts the form's action attribute and serialises the form inputs
     * from the third one on as "name=value&" pairs, stopping after the input
     * named "sub1". The first two inputs are left out because the callers
     * append their own text fields. Values are taken from the markup as is.
     *
     * @param string|false $page page as returned by httpconnector
     * @return array|false array(action, serialised fields), or false when the
     *                     marker or the action attribute is missing
     */
    private static function extractPostForm($page) {
        $sections = explode('<div class="d h">', (string) $page);
        if (!isset($sections[1])) {
            return false;
        }
        $untilFormEnd = explode('</form>', $sections[1]);
        $form = $untilFormEnd[0];

        // The original called strpos() with haystack and needle swapped and
        // relied on a fixed offset; read the attribute itself instead.
        if (!preg_match('/\baction\s*=\s*"([^"]*)"/i', $form, $match)) {
            return false;
        }
        $action = $match[1];

        $inputs = explode("<input", $form);
        $total = count($inputs);
        $fields = "";
        for ($i = 3; $i < $total; $i++) {
            $name = self::attribute($inputs[$i], 'name');
            $value = self::attribute($inputs[$i], 'value');
            $fields .= $name . "=" . $value . "&";
            if ($name == 'sub1') {
                break;
            }
        }

        return array($action, $fields);
    }

    /**
     * Returns the double-quoted value of an attribute inside a tag fragment.
     *
     * @param string $fragment markup of a single tag
     * @param string $name     attribute name
     * @return string attribute value, or '' when the attribute is absent
     */
    private static function attribute($fragment, $name) {
        $pattern = '/\b' . preg_quote($name, '/') . '\s*=\s*"([^"]*)"/i';
        return preg_match($pattern, $fragment, $match) ? $match[1] : '';
    }
}

?>
[/php]

一个网页抓取的类支持get+post+cookie存储

以前做了个贴吧发帖机,当时设定的是每发一帖就登录一次,结果账号被禁用了。后来改了下程序,登录一次后保存cookie信息,下次再发帖就不用登录了。

[php]
<?php
/**
 * Minimal cURL-based HTTP client that carries a cookie string from one
 * request to the next and keeps it under the key "cookie" in a SaeKV store
 * (presumably the key-value service of Sina App Engine - SaeKV is not defined
 * in this file), so that a later run can reuse the stored cookies instead of
 * logging in again.
 *
 * Responses are returned with the HTTP headers included (CURLOPT_HEADER is
 * enabled), exactly as curl_exec() delivers them.
 */
class httpconnector {
    /** User-Agent header sent with every request. */
    const USER_AGENT = "Mozilla/4.0 (compatible; MSIE 5.01; Windows NT 5.0)";

    /** cURL handle of the request currently being performed. */
    private $curl;

    /** Cookie header value: "name=value" pairs separated by semicolons. */
    private $cookie = '';

    /** SaeKV instance used to persist the cookie string. */
    private $kv;

    /**
     * Opens the key-value store and loads a previously saved cookie string,
     * if the store returns a non-empty value for "cookie".
     */
    function __construct() {
        $this->kv = new SaeKV();
        $this->kv->init();

        $stored = $this->kv->get("cookie");
        if ($stored) {
            $this->cookie = $stored;
        }
    }

    /**
     * Downloads a page with an HTTP GET request.
     *
     * @param string $url address to fetch
     * @return string|false response headers followed by the body, or false
     *                      when curl_exec() fails
     */
    public function get($url) {
        return $this->request($url, false, null);
    }

    /**
     * Downloads a page with an HTTP POST request.
     *
     * @param string $url    address to post to
     * @param mixed  $params POST payload, handed to CURLOPT_POSTFIELDS as is
     * @return string|false response headers followed by the body, or false
     *                      when curl_exec() fails
     */
    public function post($url, $params) {
        return $this->request($url, true, $params);
    }

    /**
     * Performs one request, then folds the cookies set by the response into
     * the stored cookie string and saves it in the key-value store.
     *
     * @param string $url    address to request
     * @param bool   $isPost true to send a POST, false for a GET
     * @param mixed  $params POST payload (ignored for GET)
     * @return string|false raw response including headers, or false on failure
     */
    private function request($url, $isPost, $params) {
        $this->curl = curl_init();

        curl_setopt($this->curl, CURLOPT_URL, $url);
        // Keep the response headers in the returned data: the cookies are read from them.
        curl_setopt($this->curl, CURLOPT_HEADER, 1);
        curl_setopt($this->curl, CURLOPT_USERAGENT, self::USER_AGENT);
        if ($this->cookie !== '') {
            curl_setopt($this->curl, CURLOPT_COOKIE, $this->cookie);
        }
        if ($isPost) {
            curl_setopt($this->curl, CURLOPT_POST, 1);
            curl_setopt($this->curl, CURLOPT_POSTFIELDS, $params);
        }
        // Return the response as a string instead of printing it.
        curl_setopt($this->curl, CURLOPT_RETURNTRANSFER, 1);

        $data = curl_exec($this->curl);
        $headerSize = ($data === false) ? 0 : (int) curl_getinfo($this->curl, CURLINFO_HEADER_SIZE);
        curl_close($this->curl);

        if ($data === false) {
            // Nothing was received, so the stored cookies stay untouched.
            return false;
        }

        // Only the header section is scanned, so body text that merely looks
        // like a Set-Cookie line cannot pollute the cookie string.
        $headers = (string) substr($data, 0, $headerSize);
        $this->cookie = self::mergeCookies($this->cookie, $headers);
        $this->kv->set("cookie", $this->cookie);

        return $data;
    }

    /**
     * Combines an existing cookie string with the cookies of a response.
     *
     * Every "Set-Cookie:" header contributes its leading name=value pair
     * (attributes after the first semicolon are dropped). A cookie that is
     * set again replaces the earlier value instead of being appended a second
     * time, so the string does not grow with every request.
     *
     * @param string $existing current cookie string ("a=1;b=2")
     * @param string $headers  raw response header section
     * @return string merged cookie string, pairs joined by "; "
     */
    private static function mergeCookies($existing, $headers) {
        $pairs = explode(';', (string) $existing);
        if (preg_match_all('/^Set-Cookie:[ \t]*([^;\r\n]+)/mi', (string) $headers, $found)) {
            $pairs = array_merge($pairs, $found[1]);
        }

        $jar = array();
        foreach ($pairs as $pair) {
            $pair = trim($pair);
            if ($pair === '') {
                continue;
            }
            $bits = explode('=', $pair, 2);
            // Keyed by cookie name: a later pair overwrites an earlier one.
            $jar[trim($bits[0])] = $pair;
        }

        return implode('; ', $jar);
    }
}
?>
[/php]

加载进度条

[html]
<style type="text/css">
/* Page defaults for the demo. */
body{
padding: 20px;
font-size: 14px;
}
/* Container of the progress bar and its caption. */
#progressbar{
width: 278px;
}
/* Outlined track; the 1px padding leaves a gap between the outline and the fill. */
#progressbar .border{
border: 1px solid #777;
width: 276px;
height: 13px;
padding: 1px;
}
/* Green fill; this width is the initial state, the script overwrites it through the inline style. */
#progressbar .bar{
background-color: #73c944;
width: 50%;
height: 13px;
overflow: hidden;
}
/* Caption centred below the track. */
#progressbar .desc{
text-align: center;
font-size: 12px;
line-height: 24px;
}
</style>
<!-- Progress bar markup. The script looks the parts up by position among the
     div descendants of #progressbar: index 1 is the fill (.bar) and index 2
     the caption (.desc). -->
<div id="progressbar">
<div class="border">
<div class="bar">&#160;</div>
</div>
<div class="desc">
正在加载…
</div>
</div>

<p>loading 将在 10s 后结束。</p>

<script type="text/javascript">
/**
 * Animated "loading" indicator bound to the #progressbar element.
 *
 * On creation it starts advancing the bar: every tick moves the fill 20% of
 * the remaining distance towards 100%, and each tick waits 100 ms longer than
 * the previous one (the first wait is 600 ms), so the bar keeps slowing down
 * without ever reaching the end by itself.
 *
 * loading.remove() stops the animation, shows the completion text, fills the
 * bar and removes the whole indicator from the page 500 ms later.
 */
var loading = (function () {
    var ct = document.getElementById("progressbar"),
        divs = ct.getElementsByTagName("div"),
        bar = divs[1],   // the .bar fill element
        desc = divs[2],  // the .desc caption element
        idx = 0,         // current fill in percent
        time = 500,      // delay before the next tick, grows by 100 ms per tick
        timer = null;    // pending timeout, kept local instead of leaking a global

    // Sets the fill width to n percent.
    bar.setValue = function (n) {
        this.style.width = n + "%";
    };

    // A named function replaces arguments.callee, which strict mode forbids.
    function tick() {
        bar.setValue(idx += (100 - idx) * 0.2);
        timer = setTimeout(tick, time += 100);
    }
    tick();

    return {
        remove: function () {
            clearTimeout(timer);
            desc.innerHTML = "加载完成";
            bar.setValue(100);
            setTimeout(function () {
                ct.parentNode.removeChild(ct);
            }, 500);
        }
    };
}());

// Demo: pretend that loading has finished after 10 seconds.
setTimeout(loading.remove, 10000);
</script>
[/html]

如何高效开发一个完整的网站

一个完整的网站包括两部分:1.业务逻辑;2.界面。专业点讲即M+V+C:M是model,即模型,存放网站的业务逻辑;V是view,即视图,也就是网站界面;C是controller,即控制器,顾名思义,负责联系model和view,控制显示。但是很多人在开发网站时M和C分得不是很清,我本人也经常犯这种错误,那么这样做有什么坏处呢,后面讲。

本文主要讲如何高效地开发一个完整的网站。先讲业务逻辑的编写。首先,你要确定网站的需求,需要有哪些功能,考虑哪些功能需要在不同的场景多次使用(这样的功能毫无疑问要放在M中,为了维护方便、保持一致性),然后设计数据库。但是这时候的设计只是简要的设计,不必苦思冥想,因为在你真正开始做的时候,就会发现以前的设计会有缺陷,然后就一边改一边做。设计好数据库结构后就可以开始写代码了。首先你要先写后台那部分,后台的每个功能都可以概括如下:数据的增、删、改、查,因此如果针对每个数据表写SQL语句会很痛苦,所以要借助开发框架。国内比较出色的是ThinkPHP,我看了开发文档,不是多好;推荐国外的Zend Framework和Yii,这两个是世界上最有名的两个,功能极其强大,唯一的不足是要阅读英文文档去学习,中文的大多是过时的。好了,借助框架,增删改查数据可以简化到极致,开发极其方便。后台写完了,就开始写前台了。前台主要是呈现数据、传递信息,最好做到简单,并快速传递访客想获得的数据,因此,前台的设计要好好考虑。当然前台的数据一般只涉及查和增两个功能,查询是常见的,增主要是加些评论神马的。

前后台都写完了,你会发现虽然功能实现了,但是界面有点丑陋,所以还是没法交付使用,那我们开始写界面代码。以前我认为写界面要靠PS,而做了几个项目后,我发现我PS一点都不会,也可以设计得很好,这也主要借助了框架。我用的是国外Twitter工程师开发的Bootstrap框架,有点遗憾的是它不支持IE,在IE下显示效果极差,幸运的是有人提供了一个针对IE下显示不好的解决方案,详情见https://github.com/empowering-communities/Bootstrap-IE6 ,英文的,不过很简单,一读就明白了。到此,一个网站开发完了。

至于效率,我用数据说话,我这学期做了四个项目。第一个项目完全不用框架写了二十天,第四个项目全靠框架,功能差不多(比第一个简单点),只用了三天。但是第四个效果、性能、加载速度要远优于第一个。