IT/PHP

[PHP] Snoopy를 이용한 웹 크롤링

월공 2020. 12. 4. 08:25
728x90
300x250

<?php
// Fetch a remote page with Snoopy and make sure the HTML held in $txt is
// UTF-8 before any further parsing.
//
// Variables left for the code that follows:
//   $txt     - the fetched HTML (converted to UTF-8 when the page declares
//              EUC-KR), or an empty string when nothing usable was fetched.
//   $matches - preg_match() groups of the Content-Type <meta> tag;
//              $matches[3] is the declared charset and is only set when the
//              tag carries a "charset=" part.

include_once 'Snoopy.class.php';

$url = "크롤링 하고싶은 URL";

$snoopy = new snoopy();
$snoopy->fetch($url);

// preg_match() and iconv() below need a string subject, so anything else
// left in ->results is treated as "no content".
// NOTE(review): presumably ->results is a string after a normal fetch -
// confirm against the bundled Snoopy version.
$txt = is_string($snoopy->results) ? $snoopy->results : '';

//print_r($txt);

// Group 1 = MIME type, group 3 = charset. Whitespace after the ';' is
// optional so that both "text/html; charset=x" and "text/html;charset=x"
// are recognised.
preg_match(
    '@<meta\s+http-equiv="Content-Type"\s+content="([\w/]+)(;\s*charset=([^\s"]+))?@i',
    $txt,
    $matches
);

// If the page declares EUC-KR (in any letter case), convert it to UTF-8.
if (isset($matches[3]) && strcasecmp($matches[3], 'euc-kr') === 0) {
    $convertedTxt = iconv('euc-kr', 'utf-8', $txt);

    // iconv() returns false when the input contains a byte sequence that is
    // not valid EUC-KR; keep the original bytes in that case instead of
    // replacing the whole page with false.
    if ($convertedTxt !== false) {
        $txt = $convertedTxt;
    }
}

 

// Extract page metadata from the HTML in $txt. Each preg_match() call fills
// its result array with the captured value at index 1, or leaves it empty
// when the tag is not present.
//
// Attribute gaps are matched with [^>]* rather than .* so that a match can
// never run past the end of its own tag and pick up the content="..." of a
// later tag; captured values use [^"] so they stop at the closing quote.
// NOTE: only tags that put property/name before content are recognised.

// <meta charset="..."> (HTML5 form), with or without a self-closing slash.
preg_match('#<meta [^>]*charset="([^"]+)"\s*/?>#si', $txt, $charset);

// Document title; attributes on the <title> tag are tolerated.
preg_match('#<title[^>]*>(.*?)</title>#si', $txt, $title);

// Open Graph title, canonical URL and image.
preg_match('#<meta\s+property="og:title"[^>]*?\scontent="([^"]+)"[^>]*>#si', $txt, $ogTitle);

preg_match('#<meta\s+property="og:url"[^>]*?\scontent="([^"]+)"[^>]*>#si', $txt, $crawling_url);

preg_match('#<meta\s+property="og:image"[^>]*?\scontent="([^"]+)"[^>]*>#si', $txt, $ogImage);

// Fallback for og:image in the exact "property content" layout; consulted by
// the output code only when $ogImage is empty.
preg_match('#<meta property="og:image" content="([^"]*)"\s?/?>#', $txt, $ogImage2);

// Open Graph description and the classic keywords tag.
preg_match('#<meta\s+property="og:description"[^>]*?\scontent="([^"]*)"[^>]*>#si', $txt, $des);

preg_match('#<meta\s+name="keywords"[^>]*?\scontent="([^"]*)"[^>]*>#si', $txt, $keywords);

 

// Print the extracted metadata as a small HTML fragment.
//
// Two things are handled here that plain echo of $x[1] does not:
//   * a preg_match() result array is empty when its pattern did not match,
//     so every index is read through isset() and defaults to '';
//   * all values come from a remote page and are therefore untrusted, so
//     they are HTML-escaped before being written into the response.
// double_encode is off (4th argument false) because values scraped from
// HTML may already contain entities such as &amp;.
$crawlEscFlags = ENT_QUOTES | ENT_SUBSTITUTE;

$crawlCharset     = isset($matches[3]) ? $matches[3] : '';
$crawlTitle       = isset($title[1]) ? $title[1] : '';
$crawlOgTitle     = isset($ogTitle[1]) ? $ogTitle[1] : '';
$crawlUrl         = isset($crawling_url[1]) ? $crawling_url[1] : '';
$crawlDescription = isset($des[1]) ? $des[1] : '';
$crawlKeywords    = isset($keywords[1]) ? $keywords[1] : '';

// Prefer the primary og:image match and fall back to the secondary one.
$crawlImage = '';
if (isset($ogImage[1]) && $ogImage[1] !== '') {
    $crawlImage = $ogImage[1];
} elseif (isset($ogImage2[1]) && $ogImage2[1] !== '') {
    $crawlImage = $ogImage2[1];
}

echo htmlspecialchars($crawlCharset, $crawlEscFlags, 'UTF-8', false);

echo "<br>";

echo "그냥 타이틀 : " . htmlspecialchars($crawlTitle, $crawlEscFlags, 'UTF-8', false);

echo "<br>";

echo "og 타이틀 : " . htmlspecialchars($crawlOgTitle, $crawlEscFlags, 'UTF-8', false);

echo "<br>";

echo "URL : " . htmlspecialchars($crawlUrl, $crawlEscFlags, 'UTF-8', false);

echo "<br>";

// The image row is only emitted when an image URL was found.
if ($crawlImage !== '') {
    echo "\n이미지 : <img src=\""
        . htmlspecialchars($crawlImage, $crawlEscFlags, 'UTF-8', false)
        . "\" alt=\"\" width=\"150\" height=\"150\">\n\n";
}

echo "<br>";

echo "사이트 설명 : " . htmlspecialchars($crawlDescription, $crawlEscFlags, 'UTF-8', false);

echo "<br>";

echo "키워드 : " . htmlspecialchars($crawlKeywords, $crawlEscFlags, 'UTF-8', false);

?>

 

 

728x90
300x250