{"id":58,"date":"2013-04-14T03:35:18","date_gmt":"2013-04-14T01:35:18","guid":{"rendered":"https:\/\/www.aleskrejci.cz\/blog\/?p=58"},"modified":"2013-04-14T03:35:18","modified_gmt":"2013-04-14T01:35:18","slug":"parsovani-stranky-v-php","status":"publish","type":"post","link":"https:\/\/www.aleskrejci.cz\/blog\/2013\/04\/14\/parsovani-stranky-v-php\/","title":{"rendered":"Parsov\u00e1n\u00ed str\u00e1nky v PHP"},"content":{"rendered":"<p>Mus\u00edme m\u00edt nainstalovanou knihovnu\u00a0php5-curl.<\/p>\n<pre class=\"brush: bash; title: ; notranslate\" title=\"\">sudo apt-get install php5-curl\nsudo service apache2 restart<\/pre>\n<p>Vytvo\u0159\u00edme si nap\u0159\u00edklad soubor cURLapi.php<\/p>\n<pre class=\"brush: php; title: ; notranslate\" title=\"\">&lt;?php\n\n\/* Pozn.:\n * apt-get install php5-curl \n *\/\n\nfunction cURL_getPage($url) {\n    $curl = curl_init();\n\n    \/* BEGIN: userAgent *\/\n    $userAgent = &quot;Firefox (WindowsXP) - Mozilla\/5.0 (Windows; U; Windows NT 5.1; en-GB; rv:1.8.1.6) Gecko\/20070725 Firefox\/2.0.0.6&quot;;\n    \/* END: userAgent *\/\n\n    \/* BEGIN: header *\/\n    $httpHeaderArray = array(\n        &quot;Accept: text\/xml,application\/xml,application\/xhtml+xml,text\/html;q=0.9&quot;,\n        &quot;Cache-Control: max-age=0&quot;,\n        &quot;Connection: keep-alive&quot;,\n        &quot;Keep-Alive: 300&quot;,\n        &quot;Accept-Charset: ISO-8859-1,utf-8;q=0.7,*;q=0.7&quot;,\n        &quot;Accept-Language: en-us,en;q=0.5&quot;,\n        &quot;Pragma: &quot;\n    );\n    \/* END: header *\/\n\n    \/* BEGIN: referer *\/\n    $refererLink = &quot;https:\/\/www.google.com\/search?q=&quot; . 
$url;\n    \/* END: referer *\/\n\n    curl_setopt($curl, CURLOPT_URL, $url);  \/\/ url\n    curl_setopt($curl, CURLOPT_SSLVERSION, 3);\n    curl_setopt($curl, CURLOPT_SSL_VERIFYPEER, FALSE);\n    curl_setopt($curl, CURLOPT_SSL_VERIFYHOST, 2);\n    curl_setopt($curl, CURLOPT_USERAGENT, $userAgent);  \/\/ userAgent\n    curl_setopt($curl, CURLOPT_HTTPHEADER, $httpHeaderArray);  \/\/ hlavicka\n    curl_setopt($curl, CURLOPT_REFERER, $refererLink);  \/\/ referer\n    curl_setopt($curl, CURLOPT_HEADER, FALSE);  \/\/ zobrazi nam serverovou hlavicku (TRUE\/FALSE)\n    curl_setopt($curl, CURLOPT_RETURNTRANSFER, TRUE);  \/\/ vrati nam webovou stranku (TRUE\/FALSE)\n    curl_setopt($curl, CURLOPT_FOLLOWLOCATION, TRUE);  \/\/ povoli presmerovani v url (TRUE\/FALSE)\n    curl_setopt($curl, CURLOPT_AUTOREFERER, TRUE);  \/\/ povoli posilani referencniho odkazu (TRUE\/FALSE)\n    curl_setopt($curl, CURLOPT_ENCODING, &quot;&quot;);\n    curl_setopt($curl, CURLOPT_MAXREDIRS, 10);  \/\/ zastavi stahovani po 10 presmerovanich\n    curl_setopt($curl, CURLOPT_CONNECTTIMEOUT, 120);  \/\/ timeout pro pripojeni\n    curl_setopt($curl, CURLOPT_TIMEOUT, 10);  \/\/ timeout na odpoved\n    \/* Proxy *\/\n    \/\/ curl_setopt($curl, CURLOPT_PROXYTYPE, CURLPROXY_SOCKS5);\n    \/\/ curl_setopt($curl, CURLOPT_PROXY, &quot;127.0.0.1&quot;);\n    \/\/ curl_setopt($curl, CURLOPT_PROXYPORT, 9050);\n\n    $html = curl_exec($curl);  \/\/ provede curl prikazy\n    curl_close($curl);  \/\/ zavre spojeni\n    return $html;  \/\/ vrati webovou stranku\n}<\/pre>\n<p>A do indexu.php<\/p>\n<pre class=\"brush: php; title: ; notranslate\" title=\"\">&lt;?php\n\nrequire(&#039;cURLapi.php&#039;);\nheader(&#039;Content-Type: text\/html; charset=utf-8&#039;); \/\/ Vytiskne parsovany web v UTF-8\n\n$url = &quot;http:\/\/nejakaUrl.tld&quot;;\n$page = cURL_getPage($url);\n\n$page = htmlspecialchars($page, ENT_QUOTES); \/\/ Priprava pro strojove zpracovani - zobrazeni kodu. 
Zakomentovanim tohoto radku se nam webova stranka zobrazi klasicky.\necho &#039;&lt;pre&gt;&#039; . $page . &#039;&lt;\/pre&gt;&#039;;<\/pre>\n<p>A kr\u00e1sn\u011b n\u00e1m to str\u00e1nku tah\u00e1 \ud83d\ude09 , to\u0165 v\u0161e.<\/p>\n","protected":false},"excerpt":{"rendered":"<p>Mus\u00edme m\u00edt nainstalovanou knihovnu\u00a0php5-curl. sudo apt-get install php5-curl sudo service apache2 restart Vytvo\u0159\u00edme si nap\u0159\u00edklad soubor cURLapi.php &lt;?php \/* Pozn.: * apt-get install php5-curl *\/ function cURL_getPage($url) { $curl = curl_init(); \/* BEGIN: userAgent *\/ $userAgent = &quot;Firefox (WindowsXP) &#8211; Mozilla\/5.0 (Windows; U; Windows NT 5.1; en-GB; rv:1.8.1.6) Gecko\/20070725 Firefox\/2.0.0.6&quot;; \/* END: userAgent *\/ \/* [&hellip;]<\/p>\n","protected":false},"author":1,"featured_media":0,"comment_status":"closed","ping_status":"open","sticky":false,"template":"","format":"standard","meta":{"footnotes":""},"categories":[37],"tags":[36,34,35],"class_list":["post-58","post","type-post","status-publish","format-standard","hentry","category-php","tag-web-scraping","tag-php","tag-curl"],"_links":{"self":[{"href":"https:\/\/www.aleskrejci.cz\/blog\/wp-json\/wp\/v2\/posts\/58","targetHints":{"allow":["GET"]}}],"collection":[{"href":"https:\/\/www.aleskrejci.cz\/blog\/wp-json\/wp\/v2\/posts"}],"about":[{"href":"https:\/\/www.aleskrejci.cz\/blog\/wp-json\/wp\/v2\/types\/post"}],"author":[{"embeddable":true,"href":"https:\/\/www.aleskrejci.cz\/blog\/wp-json\/wp\/v2\/users\/1"}],"replies":[{"embeddable":true,"href":"https:\/\/www.aleskrejci.cz\/blog\/wp-json\/wp\/v2\/comments?post=58"}],"version-history":[{"count":0,"href":"https:\/\/www.aleskrejci.cz\/blog\/wp-json\/wp\/v2\/posts\/58\/revisions"}],"wp:attachment":[{"href":"https:\/\/www.aleskrejci.cz\/blog\/wp-json\/wp\/v2\/media?parent=58"}],"wp:term":[{"taxonomy":"category","embeddable":true,"href":"https:\/\/www.aleskrejci.cz\/blog\/wp-json\/wp\/v2\/categories?post=58"},{"taxonomy":
"post_tag","embeddable":true,"href":"https:\/\/www.aleskrejci.cz\/blog\/wp-json\/wp\/v2\/tags?post=58"}],"curies":[{"name":"wp","href":"https:\/\/api.w.org\/{rel}","templated":true}]}}