随手搞的采集某政府信息网的table表格数据

随手写的一个采集某政府信息网 table 表格数据的脚本。总结一句话:政府网站大多都是用 table 表格布局的,掌握了技巧采集起来非常简单,不会用技巧采集的话就非常蛋疼了……

<?php
// Scraper entry point: fetches one listing page (selected by ?page=N), follows
// every detail link on it, extracts the detail table with get_td_array(),
// stores one row per detail page in the `caiji` table and echoes the same
// data as an HTML table.
//
// NOTE(review): ext/mysql was removed in PHP 7; this script only runs on
// PHP 5.x. Migrating to PDO or mysqli with prepared statements is recommended.
if (!mysql_connect("localhost","root","")) {
	die("db connect failed: ".mysql_error());
}
mysql_select_db("db");
// Set the charset through the API rather than "SET NAMES": only this makes
// mysql_real_escape_string() aware of the multi-byte GBK encoding.
mysql_set_charset("gbk");

// The page number comes from the query string; force it to a positive integer
// before it is concatenated into the remote URL.
$page = isset($_GET['page']) ? (int)$_GET['page'] : 1;
if ($page < 1) {
	$page = 1;
}
if ($page > 135) {
	// Past the last listing page: stop here so the redirect chain ends.
	die("ok");
}
$url = "xxxxxxxxxxxxx.jsp?item_id=0701&comp_iswl=0&flag=T&IntCurPage=".$page;

$data = (string)getcontents($url);

// Collect the detail links; group 1 is the part after "search_info",
// e.g. ".jsp?item_id=0701&comp_id=A08009062414062914".
preg_match_all("/<td><a href=\"search_info(.*)\">(.*)<\/a><\/td>/imsU",$data,$urls);
$urlArr = $urls[1];

// (row, column) coordinates of the fixed cells stored in v1..v7. Per the
// header row that was disabled in the original script these are presumably:
// company name, legal representative, company type, business manager,
// technical manager, business address, contact person.
$fixedCells = array(
	array(1, 1),
	array(2, 3),
	array(3, 1),
	array(4, 1),
	array(4, 3),
	array(8, 1),
	array(9, 1),
);

$html = "";
foreach ($urlArr as $suffix) {
	$content = (string)getcontents("xxxxx/search_info".$suffix);

	// Cut out the block between the centred "font03" header and the
	// "业绩信息" heading. \s+ tolerates any whitespace between the two
	// attributes (the original pattern had a hard line break there).
	if (!preg_match("/class=\"font03\"\s+style=\"text-align:center\">(.*)业绩信息<\/strong><\/th>/imsU", $content, $match)) {
		// Layout not recognised (or the download failed): skip instead of
		// inserting an empty row.
		continue;
	}
	$res = get_td_array($match[1]);
	if (empty($res)) {
		continue;
	}

	// Fixed cells first; a missing cell becomes an empty string.
	$values = array();
	foreach ($fixedCells as $rc) {
		$values[] = isset($res[$rc[0]][$rc[1]]) ? $res[$rc[0]][$rc[1]] : "";
	}

	// Every non-empty row after index 11 (the qualification section) is
	// flattened into one extra column: v8, v9, ...
	foreach ($res as $k => $row) {
		if ($k > 11 && $row) {
			$values[] = str_replace("&nbsp;", "  ", implode("  ", $row));
		}
	}

	$html .= "<tr><td>".implode("</td><td>", $values)."</td></tr>";

	// Build the INSERT with escaped values; column names are generated
	// locally (v1..vN), so only the values need escaping.
	$columns = array("id");
	$escaped = array("''");
	foreach ($values as $i => $value) {
		$columns[] = "v".($i + 1);
		$escaped[] = "'".mysql_real_escape_string($value)."'";
	}
	$sql = "insert into caiji (".implode(",", $columns).") values (".implode(",", $escaped).")";
	if (!mysql_query($sql)) {
		error_log("caiji insert failed: ".mysql_error());
	}
}

echo "<table width=200%>";
echo $html;
echo "</table>";






/**
 * Flatten an HTML table fragment into a two-dimensional array of cell texts.
 *
 * Opening <table>/<tr>/<td> tags are dropped, closing </tr> and </td> tags are
 * turned into separators, every other tag is stripped, and all ordinary
 * spaces (including those inside cell text) are removed.
 *
 * @param string $table HTML fragment containing the table rows.
 * @return array List of rows, each a list of cell strings; an empty array
 *               when the fragment contains no closed row.
 */
function get_td_array($table) {
	$table = (string)$table;

	// Drop the opening tags together with their attributes.
	$table = preg_replace("'<table[^>]*?>'si", "", $table);
	$table = preg_replace("'<tr[^>]*?>'si", "", $table);
	$table = preg_replace("'<td[^>]*?>'si", "", $table);
	// Mark row/cell boundaries. Case-insensitive, like the opening-tag
	// patterns above, so that </TR> and </TD> are not lost to the generic
	// tag stripping below.
	$table = str_ireplace("</tr>", "{tr}", $table);
	$table = str_ireplace("</td>", "{td}", $table);

	// Strip every remaining HTML tag.
	$table = preg_replace("'<[\/\!]*?[^<>]*?>'si", "", $table);

	// Remove line breaks followed by whitespace, then all ordinary spaces.
	// NOTE(review): the original made this str_replace call twice with what
	// looks like the same literal; if the second one was meant to target a
	// different whitespace character (e.g. a full-width space), add it here.
	$table = preg_replace("'([\r\n])[\s]+'", "", $table);
	$table = str_replace(" ", "", $table);

	// Initialised up front so that input without any row yields an empty
	// array instead of an undefined variable.
	$td_array = array();

	$rows = explode('{tr}', $table);
	array_pop($rows); // the piece after the last {tr} is not a row
	foreach ($rows as $tr) {
		$cells = explode('{td}', $tr);
		array_pop($cells); // the piece after the last {td} is not a cell
		$td_array[] = $cells;
	}
	return $td_array;
}


/**
 * Fetch a URL with an HTTP GET request via cURL and return the response body.
 *
 * Only the connection phase is time-limited (5 seconds).
 * NOTE(review): no overall transfer timeout is set, so a server that accepts
 * the connection and then stalls can block the script; consider
 * CURLOPT_TIMEOUT.
 *
 * @param string $url Address to request.
 * @return string|false Response body, or false when curl_exec() fails.
 */
function getcontents($url) {
	$ch = curl_init();
	$timeout = 5;
	curl_setopt($ch, CURLOPT_URL, $url);
	curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1); // return the body instead of printing it
	curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, $timeout);
	$contents = curl_exec($ch);
	// Release the handle: this function is called once per detail page, so
	// unclosed handles would otherwise pile up until the script ends.
	curl_close($ch);
	return $contents;
}

/**
 * Submit form fields to a URL with an HTTP POST request via cURL.
 *
 * The fields are sent as a URL-encoded "name=value&name=value" body.
 *
 * @param string $url      Address to post to.
 * @param array  $postData Map of field name => scalar value.
 * @return string|false Response body; false when $url is empty or when
 *                      curl_exec() fails.
 */
function curlPost($url,$postData=array()) {
	if(empty($url)) return false;

	// Encode names as well as values so that reserved characters in a field
	// name cannot corrupt the request body.
	$pairs = array();
	foreach ($postData as $k=>$v){
		$pairs[] = urlencode($k)."=".urlencode($v);
	}
	$body = implode("&", $pairs);

	$ch = curl_init();
	$timeout = 5;
	curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1); // return the body instead of printing it
	curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, $timeout);

	curl_setopt($ch, CURLOPT_POST, 1);
	curl_setopt($ch, CURLOPT_HEADER, 0); // keep response headers out of the result
	curl_setopt($ch, CURLOPT_URL, $url);
	curl_setopt($ch, CURLOPT_POSTFIELDS, $body);
	$contents = curl_exec($ch);
	// Release the handle so repeated calls do not accumulate open handles.
	curl_close($ch);
	return $contents;
}

?>
<script type="text/javascript">location.href="get.php?page=<?php /* cast first: the page number may still be the raw request string */ echo ((int)$page + 1); ?>"</script>

关键词: table表格 , 采集

上一篇: ssh连接经常超时和ssh连接很慢的原因分析及解决方法
下一篇: 解决javascript下location跳转获取不到来源referer的方法

目前还没有人评论,您发表点看法?
发表评论

评论内容 (必填):