博客
关于我
强烈建议你试试无所不能的chatGPT,快点击我
一个爬虫
阅读量:7101 次
发布时间:2019-06-28

本文共 7184 字,大约阅读时间需要 23 分钟。

'; $myfile = fopen('D:\歌词.txt.txt','r'); echo '1'; $info = []; $num = 0; $number = 0; while($line = fgets($myfile)){ //获取用户名 $net_name_index = strpos($line,'用户名:'); $net_name_end = strpos($line,'email:',$net_name_index); $net_name = trim(substr($line,$net_name_index+strlen('用户名:'),$net_name_end-($net_name_index+strlen('用户名:')))); //获取email $email_index = strpos($line,'email:',$net_name_end); $email_end = strpos($line,'真名:',$email_index); $email = trim(substr($line,$email_index+strlen('email:'),$email_end-($email_index+strlen('email:')))); //获取真名 $name_index = strpos($line,'真名:',$email_end); $name_end = strpos($line,'身份证号:',$name_index); $name = trim(substr($line,$name_index+strlen('真名:'),$name_end-($name_index+strlen('真名:')))); //获取身份证号 $idCard_index = strpos($line,'身份证号:',$name_end); $idCard_end = strpos($line,'绑定手机号',$idCard_index); $idCard = trim(substr($line,$idCard_index+strlen('身份证号:'),$idCard_end-($idCard_index+strlen('身份证号:')))); if(strlen($idCard)!=18){ continue; } $number = $number+1; //获取手机号 $phone_number_index = strpos($line,'绑定手机号',$idCard_end); $phone_number_end = strpos($line,'账户可',$phone_number_index); $phone_number = trim(substr($line,$phone_number_index+strlen('绑定手机号'),$phone_number_end-($phone_number_index+strlen('绑定手机号')))); //获取银行卡号 $bankCard_index = strpos($line,'行卡号:',$phone_number_end); $bankCard_end = strpos($line,'银行:',$bankCard_index); $bankCard = trim(substr($line,$bankCard_index+strlen('行卡号:'),$bankCard_end-($bankCard_index+strlen('行卡号:')))); //这么多重复代码。我甚至可以写个类 //抓取身份证号信息集 $idCrad_url = 'http://qq.ip138.com/idsearch/index.asp?action=idcard&userid='.$idCard; $idCrad_curl = curl($idCrad_url,'gb2312'); $idCard_result = getIDinfo($idCrad_curl); $idnex = $num++; if(strlen($bankCard)>15&&strlen($bankCard)<20){ $bankCard_url = 'http://www.cardcn.com/search.php?word='.$bankCard; $bankCard_curl = curl($bankCard_url); if(substr_count($bankCard_curl,'对不起')==0){ $bankCard_result = getBankinfo($bankCard_curl); $info[$idnex]['bankCard_info'] = $bankCard_result; } } $info[$idnex]['net_name'] = $net_name; $info[$idnex]['email'] = $email; $info[$idnex]['name'] = $name; $info[$idnex]['idCard'] = $idCard; $info[$idnex]['phone_number'] = $phone_number; $info[$idnex]['bankCard'] = $bankCard; $info[$idnex]['idCrad_info'] = $idCard_result; } cl_slqi($info); echo $number; }//$url :html链接//return :解析后的html文档(字符串)//获取CURL请求的输出信息,这个可以爬取https,非常好function curl($url,$coding='utf-8') { //初始化 $ch = curl_init(); //设置选项,包括url curl_setopt($ch, CURLOPT_URL, $url); curl_setopt($ch, CURLOPT_HEADER, 0);//不返回response头部信息 curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1); //TRUE 将curl_exec()获取的信息以字符串返回,而不是直接输出。 curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1); //支持重定向 //不验证证书和host curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, false); curl_setopt($ch, CURLOPT_SSL_VERIFYHOST, false); $result = curl_exec($ch); //释放curl句柄 curl_close($ch); //如果网站不是utf-8编码的话要转码 if($coding!='utf-8'){ $result= iconv($coding,"utf-8//IGNORE",$result); } return $result; }//处理并返回身份证信息function getIDinfo($crul){ $sex_index = strpos($crul,'别:'); $date_index = strpos($crul,'生日期:',$sex_index); $idcard_place_index = strpos($crul,';地:',$date_index); $idcard_place_end = strpos($crul,'
'),4)); $id_info['sex'] = trim(substr($crul,$sex_index+strlen('别:'),3)); $id_info['idCard_space'] = trim(substr($crul,$idcard_place_index+strlen(';地:'),$idcard_place_end-($idcard_place_index+strlen(';地:')))); return $id_info;}//处理并返回银行卡信息function getBankinfo($bank_crul){ $bank_info = []; //银行卡归属地 $back_space_index = strpos($bank_crul,'e">归属信息:'); $back_space_end = strpos($bank_crul,'',$back_space_index); $bank_info['back_space'] = trim(substr($bank_crul,$back_space_index+strlen('e">归属信息:'),$back_space_end-($back_space_index+strlen('e">归属信息:')))); //银行名称 $bank_name_index = strpos($bank_crul,'e">银行名称:',$back_space_end); $bank_name_end = strpos($bank_crul,'',$bank_name_index); $bank_info['bank_name'] = trim(substr($bank_crul,$bank_name_index+strlen('e">银行名称:'),$bank_name_end-($bank_name_index+strlen('e">银行名称:')))); //银行卡名称 $bankCard_name_index = strpos($bank_crul,'e">银行卡名:',$bank_name_end); $bankCard_name_end = strpos($bank_crul,'',$bankCard_name_index); $bank_info['bankCard_name'] = trim(substr($bank_crul,$bankCard_name_index+strlen('e">银行卡名:'),$bankCard_name_end-($bankCard_name_index+strlen('e">银行卡名:')))); //银行卡种类 $bank_info['bank_kind'] = getKeyWord($bank_crul,'
银行卡种:','
'); return $bank_info; }//截取有用的子串(爬虫相关)//$info=网页 $first_key=开始的字符串 $last_key=结束的字符串//return 中间的字符串;function getKeyWord($info,$first_key,$last_key){ $len = strlen($first_key); $first_key_start = strpos($info,$first_key); $last_key_start = strpos($info,$last_key,$first_key_start); $keyword = trim(substr($info,$first_key_start+$len,$last_key_start-$first_key_start-$len)); return $keyword;}//把数据写入到数据库function cl_slqi($arr){ $con = mysqli_connect('localhost','root','root','aiqiyi'); if(!$con){ die('could not connect'); } $temp = 0; foreach($arr as $value=>$key){ if(!isset($key['bankCard_info'])){ $sql = "insert into info(name,idCard,idCard_space,sex,date,net_name,email,phone_number) values('{
$key['name']}','{
$key['idCard']}','{
$key['idCrad_info']['idCard_space']}','{
$key['idCrad_info']['sex']}','{
$key['idCrad_info']['date']}','{
$key['net_name']}','{
$key['email']}','{
$key['phone_number']}')"; }else{ $sql = "insert into info(name,idCard,idCard_space,sex,date,net_name,email,phone_number,bankCard,back_name,bankCard_name,back_kind,back_space) values('{
$key['name']}','{
$key['idCard']}','{
$key['idCrad_info']['idCard_space']}','{
$key['idCrad_info']['sex']}','{
$key['idCrad_info']['date']}','{
$key['net_name']}','{
$key['email']}','{
$key['phone_number']}','{
$key['bankCard']}','{
$key['bankCard_info']['bank_name']}','{
$key['bankCard_info']['bankCard_name']}','{
$key['bankCard_info']['bank_kind']}','{
$key['bankCard_info']['back_space']}')"; } if(mysqli_query($con,$sql)){ echo 'insert成功!这是第'.$temp.'个成功!'; $temp++; echo "\n"; }else{ echo 'insert失败!';echo "\n"; } }}?>

 

转载于:https://www.cnblogs.com/cl94/p/9020751.html

你可能感兴趣的文章
线程和线程池
查看>>
Camstar开发常用数据库表及其关联
查看>>
html中的一些按钮之类的操作
查看>>
走进 AQS 瞧一瞧看一看
查看>>
NO18 linux开机自启动设置--开机流程--中文乱码--查看行数
查看>>
Java的四种内部类
查看>>
10-16C#for...循环语句(2)
查看>>
CentOS查看软件源提供的软件版本命令
查看>>
caffe 学习记录1及网络结构
查看>>
html5学习笔记——html新增属性(四)
查看>>
收藏的链接
查看>>
【原创】5月份月会总结
查看>>
手机号码归属地查询
查看>>
IO和socket编程
查看>>
Docker结合Jenkins构建持续集成环境
查看>>
一些Android经验
查看>>
Java设计模式-适配器模式
查看>>
求任意数阶乘最后一位
查看>>
android 循环操作
查看>>
Promise & Deferred objects in JavaScript Pt.1: Theory and Semantics.
查看>>