Array( '10.10.10.111:80', '190.10.10.111:80', '211.10.10.111:80', '20.10.10.111:80' ), [8080] => Array( '10.10.10.111:8080', '190.10.10.111:8080', '211.10.10.111:8080', '20.10.10.111:8080' ) ) */

// Driver: scrape the candidate proxies (grouped by port), run every group
// through proxy_test() against google.com with a $maxtime of 5, and print
// the flat list of proxies that passed.
$untested_proxies = samair_proxies();

$proxies = array();
foreach( $untested_proxies as $testlist ) {
    $proxies = array_merge( $proxies, proxy_test( $testlist, 'http://google.com', 5 ) );
}

print_r( $proxies );

/* Function Definitions ------------------------------------------ */

/**
 * Grabs the proxy listings published on SAMAIR and groups them by port.
 *
 * Fetches proxy-0.htm through proxy-19.htm from www.samair.ru, strips the
 * HTML tags and collects every "digits.digits.digits.digits:digits" token
 * found in the remaining text. Duplicates are not removed.
 *
 * @return array Proxies indexed by the PORT they listen on; each value is a
 *               list of 'IP:PORT' strings in the order they were scraped.
 *               Empty when nothing could be downloaded or matched.
 */
function samair_proxies() {
    $pages = 20;
    $all_proxies = array();

    for( $i = 0; $i < $pages; $i++ ) {
        $url = 'http://www.samair.ru/proxy/proxy-' . $i . '.htm';

        $ch = curl_init();
        curl_setopt( $ch, CURLOPT_URL, $url );
        curl_setopt( $ch, CURLOPT_USERAGENT, '' );
        curl_setopt( $ch, CURLOPT_RETURNTRANSFER, 1 );
        $data = curl_exec( $ch );

        // curl_exec() returns false when the transfer fails; such a page
        // contributes no proxies, so move on to the next one.
        if( !is_string( $data ) ) {
            continue;
        }

        // Drop the markup so only the visible text is scanned.
        $data = strip_tags( $data );

        // preg_match_all() yields 0 (no match) or false (error); only merge
        // when at least one IP:PORT token was found.
        if( preg_match_all( '/\d+\.\d+\.\d+\.\d+:\d+/', $data, $these_proxies ) ) {
            $all_proxies = array_merge( $all_proxies, $these_proxies[0] );
        }
    }

    $proxies_by_port = array();
    foreach( $all_proxies as $address ) {
        $parts = explode( ':', $address );
        $port = $parts[1];

        // Appending with [] creates the per-port list on first use, so no
        // not-yet-existing key is ever read (the old truthiness check on
        // $proxies_by_port[$port] raised an undefined index notice).
        $proxies_by_port[ $port ][] = $address;
    }

    return $proxies_by_port;
}

/* This tests an array of proxies
   The array needs to list the proxies in XXX.XXX.XXX.XXX:PORT format
   $proxies - array of proxies in XXX.XXX.XXX.XXX:PORT format
   $test_url - url to test the proxy with ie 'http://www.google.com'
   $maxtime - maximum time in seconds that a proxy is allowed to respond, if it returns later than this don't list it
   $test_expression - regular expression to test the returned text
// from the webpage to ensure it is valid */

/**
 * Tests proxies by requesting $test_url through each one and returns the
 * proxies that passed.
 *
 * A proxy passes when all of the following hold:
 *   - the cURL transfer succeeded (curl_exec() did not return false),
 *   - the elapsed wall-clock time, measured in whole seconds with time(),
 *     is below $maxtime + 1,
 *   - if $test_expression is non-empty, the returned body matches it. This
 *     guards against proxies that answer with their own default page.
 *
 * @param array|string $proxies         Proxies in XXX.XXX.XXX.XXX:PORT format;
 *                                      a single proxy may be passed as a
 *                                      scalar. A falsy scalar is treated as an
 *                                      empty list.
 * @param string       $test_url        URL fetched through each proxy.
 * @param int          $maxtime         Seconds a proxy is allowed to take;
 *                                      also used as the connect timeout.
 * @param string       $test_expression Optional PCRE pattern (with
 *                                      delimiters) the response must match.
 *
 * @return array The proxies that passed, in the order they were given.
 */
function proxy_test( $proxies, $test_url, $maxtime = 5, $test_expression = '' ) {
    // Normalise the input to a list so the loop below never iterates over a
    // non-array (which would raise a warning).
    if( !is_array( $proxies ) ) {
        $proxies = $proxies ? array( $proxies ) : array();
    }

    // store proxies that meet our criteria
    $sat_proxies = array();

    foreach( $proxies as $ip ) {
        $ch = curl_init();
        curl_setopt( $ch, CURLOPT_URL, $test_url );
        curl_setopt( $ch, CURLOPT_USERAGENT, 'USER AGENT HERE' );
        curl_setopt( $ch, CURLOPT_RETURNTRANSFER, 1 );
        curl_setopt( $ch, CURLOPT_CONNECTTIMEOUT, $maxtime );
        // Bound the whole transfer as well: a proxy that connects and then
        // stalls would otherwise block forever. Anything slower than
        // $maxtime + 1 seconds is rejected by the timing check below anyway,
        // so this cap never discards a proxy that would have been accepted.
        curl_setopt( $ch, CURLOPT_TIMEOUT, $maxtime + 1 );
        // proxy to test
        curl_setopt( $ch, CURLOPT_PROXY, $ip );

        $time_start = time();
        $data = curl_exec( $ch );
        $response_time = time() - $time_start;

        // A failed transfer (refused connection, timeout, ...) returns false,
        // often almost instantly; it must not count as a fast, good proxy.
        if( $data === false ) {
            continue;
        }

        // time() has one-second resolution, hence the extra second of slack.
        if( $response_time >= ( $maxtime + 1 ) ) {
            continue;
        }

        // preg_match() returns 1 on a match, 0 on no match, false on error.
        if( $test_expression && preg_match( $test_expression, $data ) !== 1 ) {
            continue;
        }

        $sat_proxies[] = $ip;
    }

    return $sat_proxies;
}
?>