Weitere ähnliche Inhalte Ähnlich wie Web Scraping with PHP (20) Mehr von Matthew Turland (14) Kürzlich hochgeladen (20) Web Scraping with PHP 10. With plain text, we give ourselves the ability to manipulate knowledge, both manually and programmatically, using virtually every tool at our disposal. 3.14 The Power of Plain Text, The Pragmatic Programmer 18. The Web We Weave GET / HTTP/1.1 User-Agent: ... HTTP/1.1 200 OK Content-Type: ... 19. GET /index.php?foo=bar HTTP/1.1 <a href= "/index.php?foo=bar" > Index </a> <form method= "post" action= "/index.php" > <input name= "foo" value= "bar" /> </form> POST /index.php HTTP/1.1 foo=bar Browsing -> Requests 20. HTTP/1.1 200 OK Content-Type: image/gif Content-Length: 8558 Responses -> Rendered Elements <img src= "/intl/en_ALL/images/logo.gif" /> GET /intl/en_ALL/images/logo.gif HTTP/1.1 Host: google.com 31. Simple Streams Example $uri = 'http://www.example.com/some/resource' ; $get = file_get_contents( $uri ); $context = stream_context_create( array ( 'http' => array ( 'method' => 'POST' , 'header' => 'Content-Type: ' . 'application/x-www-form-urlencoded' , 'content' => http_build_query( array ( 'var1' => 'value1' , 'var2' => 'value2' )) ) ) ); $post = file_get_contents( $uri , false, $context ); 32. pecl_http Example $http = new HttpRequest( $uri ); $http ->enableCookies(); $http ->setMethod(HTTP_METH_POST); $http ->addPostFields( array ( 'var1' => 'value1' )); $http ->setOptions( array ( 'useragent' => 'PHP ' . phpversion (), 'referer' => 'http://example.com/some/referer' )); $response = $http -> send (); $headers = $response ->getHeaders(); $body = $response ->getBody(); 33. pecl_http Request Pooling $pool = new HttpRequestPool; foreach ( $urls as $url ) { $request = new HttpRequest( $url , HTTP_METH_GET); $pool ->attach( $request ); } $pool -> send (); foreach ( $pool as $request ) { echo $request ->getUrl(), PHP_EOL; echo $request ->getResponseBody(), PHP_EOL; } 36. 
Tidy Extension $config = array ( 'output-xhtml' => true); $tidy = tidy_parse_string( $markupString , $config ); $tidy = tidy_parse_file( $markupFilePath , $config ); $output = tidy_get_output( $tidy ); 37. DOM Extension $doc = new DOMDocument; $doc ->loadHTML( $htmlString ); $doc ->loadHTMLFile( $htmlFilePath ); $listItems = $doc ->getElementsByTagName( 'li' ); $xpath = new DOMXPath( $doc ); $listItems = $xpath ->query( '//ul/li' ); foreach ( $listItems as $listItem ) { echo $listItem ->nodeValue, PHP_EOL; } 38. SimpleXML Extension $sxe = new SimpleXMLElement( $markupString ); $sxe = new SimpleXMLElement( $filePath , null, true); echo $sxe ->body->ul->li[0], PHP_EOL; $children = $sxe ->body->ul->li; $children = $sxe ->body->ul->children(); foreach ( $children as $li ) { echo $li , PHP_EOL; } echo $sxe ->body->ul[ 'id' ]; $attributes = $sxe ->body->ul->attributes(); foreach ( $attributes as $name => $value ) { echo $name , '=' , $value , PHP_EOL; } 39. XMLReader Extension $doc = XMLReader::xml( $xmlString ); $doc = XMLReader::open( $filePath ); while ( $doc -> read ()) { if ( $doc ->nodeType == XMLReader::ELEMENT) { var_dump ( $doc ->localName); var_dump ( $doc ->hasValue); var_dump ( $doc ->value); var_dump ( $doc ->hasAttributes); var_dump ( $doc ->getAttribute( 'id' )); } }