mirror-4get/scraper/greppr.php
2024-05-16 19:59:43 -04:00

429 lines
7.8 KiB
PHP

<?php
class greppr{
public function __construct(){
include "lib/backend.php";
$this->backend = new backend("greppr");
include "lib/fuckhtml.php";
$this->fuckhtml = new fuckhtml();
}
public function getfilters($page){
return [];
}
private function get($proxy, $url, $get = [], $cookie = false){
$curlproc = curl_init();
if($get !== []){
$get = http_build_query($get);
$url .= "?" . $get;
}
curl_setopt($curlproc, CURLOPT_URL, $url);
curl_setopt($curlproc, CURLOPT_ENCODING, ""); // default encoding
if($cookie === false){
curl_setopt($curlproc, CURLOPT_HTTPHEADER,
["User-Agent: " . config::USER_AGENT,
"Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
"Accept-Language: en-US,en;q=0.5",
"Accept-Encoding: gzip",
"DNT: 1",
"Connection: keep-alive",
"Upgrade-Insecure-Requests: 1",
"Sec-Fetch-Dest: document",
"Sec-Fetch-Mode: navigate",
"Sec-Fetch-Site: none",
"Sec-Fetch-User: ?1"]
);
}else{
curl_setopt($curlproc, CURLOPT_HTTPHEADER,
["User-Agent: " . config::USER_AGENT,
"Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
"Accept-Language: en-US,en;q=0.5",
"Accept-Encoding: gzip",
"Cookie: PHPSESSID=" . $cookie,
"DNT: 1",
"Connection: keep-alive",
"Upgrade-Insecure-Requests: 1",
"Sec-Fetch-Dest: document",
"Sec-Fetch-Mode: navigate",
"Sec-Fetch-Site: none",
"Sec-Fetch-User: ?1"]
);
}
curl_setopt($curlproc, CURLOPT_RETURNTRANSFER, true);
curl_setopt($curlproc, CURLOPT_SSL_VERIFYHOST, 2);
curl_setopt($curlproc, CURLOPT_SSL_VERIFYPEER, true);
curl_setopt($curlproc, CURLOPT_CONNECTTIMEOUT, 30);
curl_setopt($curlproc, CURLOPT_TIMEOUT, 30);
$this->backend->assign_proxy($curlproc, $proxy);
$headers = [];
curl_setopt(
$curlproc,
CURLOPT_HEADERFUNCTION,
function($curlproc, $header) use (&$headers){
$len = strlen($header);
$header = explode(':', $header, 2);
if(count($header) < 2){
// ignore invalid headers
return $len;
}
$headers[strtolower(trim($header[0]))] = trim($header[1]);
return $len;
}
);
$data = curl_exec($curlproc);
if(curl_errno($curlproc)){
throw new Exception(curl_error($curlproc));
}
curl_close($curlproc);
return [
"headers" => $headers,
"data" => $data
];
}
public function web($get, $first_attempt = true){
if($get["npt"]){
[$q, $proxy] = $this->backend->get($get["npt"], "web");
$q = json_decode($q, true);
}else{
$search = $get["s"];
if(strlen($search) === 0){
throw new Exception("Search term is empty!");
}
$proxy = $this->backend->get_ip();
}
// get token
// token[0] = static token that changes once a day
// token[1] = dynamic token that changes on every request
// token[1] = PHPSESSID cookie
$tokens = apcu_fetch("greppr_token");
if(
$tokens === false ||
$first_attempt === false // force token fetch
){
// we haven't gotten the token yet, get it
try{
$response =
$this->get(
$proxy,
"https://greppr.org",
[]
);
}catch(Exception $error){
throw new Exception("Failed to fetch search tokens");
}
$tokens = $this->parse_token($response);
if($tokens === false){
throw new Exception("Failed to grep search tokens");
}
}
try{
if($get["npt"]){
$params = [
$tokens[0] => $q["q"],
"s" => $q["s"],
"l" => 30,
"n" => $tokens[1]
];
}else{
$params = [
$tokens[0] => $search,
"n" => $tokens[1]
];
}
$searchresults = $this->get(
$proxy,
"https://greppr.org/search",
$params,
$tokens[2]
);
}catch(Exception $error){
throw new Exception("Failed to fetch search page");
}
if(strlen($searchresults["data"]) === 0){
// redirected to main page, which means we got old token
// generate a new one
// ... unless we just tried to do that
if($first_attempt === false){
throw new Exception("Failed to get a new search token");
}
return $this->web($get, false);
}
// refresh the token with new data (this also triggers fuckhtml load)
$this->parse_token($searchresults, $tokens[2]);
// response object
$out = [
"status" => "ok",
"spelling" => [
"type" => "no_correction",
"using" => null,
"correction" => null
],
"npt" => null,
"answer" => [],
"web" => [],
"image" => [],
"video" => [],
"news" => [],
"related" => []
];
// get results for later
$results =
$this->fuckhtml
->getElementsByClassName(
"result",
"div"
);
// check for next page
$next_elem =
$this->fuckhtml
->getElementsByClassName(
"pagination",
"ul"
);
if(count($next_elem) !== 0){
$this->fuckhtml->load($next_elem[0]);
$as =
$this->fuckhtml
->getElementsByClassName(
"page-link",
"a"
);
$break = false;
foreach($as as $a){
if($break === true){
parse_str(
$this->fuckhtml
->getTextContent(
$a["attributes"]["href"]
),
$values
);
$values = array_values($values);
$out["npt"] =
$this->backend->store(
json_encode(
[
"q" => $values[0],
"s" => $values[1]
]
),
"web",
$proxy
);
break;
}
if($a["attributes"]["href"] == "#"){
$break = true;
}
}
}
// scrape results
foreach($results as $result){
$this->fuckhtml->load($result);
$a =
$this->fuckhtml
->getElementsByTagName(
"a"
)[0];
$description =
$this->fuckhtml
->getElementsByFuzzyAttributeValue(
"style",
"color:#777777;",
"p"
);
if(count($description) === 0){
$description = null;
}else{
$description =
$this->fuckhtml
->getTextContent(
$description[0]
);
}
$date =
$this->fuckhtml
->getElementsByTagName(
"p"
);
$date =
strtotime(
explode(
"Added:",
$this->fuckhtml
->getTextContent(
$date[count($date) - 1]["innerHTML"]
)
)[1]
);
$out["web"][] = [
"title" =>
$this->fuckhtml
->getTextContent(
$a["innerHTML"]
),
"description" => $description,
"url" =>
$this->fuckhtml
->getTextContent(
$a["attributes"]["href"]
),
"date" => $date,
"type" => "web",
"thumb" => [
"url" => null,
"ratio" => null
],
"sublink" => [],
"table" => []
];
}
return $out;
}
private function parse_token($response, $cookie = false){
$this->fuckhtml->load($response["data"]);
$scripts =
$this->fuckhtml
->getElementsByTagName("script");
$found = false;
foreach($scripts as $script){
preg_match(
'/window\.location ?= ?\'\/search\?([^=]+).*&n=([0-9]+)/',
$script["innerHTML"],
$tokens
);
if(isset($tokens[1])){
$found = true;
break;
}
}
if($found === false){
return false;
}
$tokens = [
$tokens[1],
$tokens[2]
];
if($cookie !== false){
// we already specified a cookie, so use the one we have already
$tokens[] = $cookie;
apcu_store("greppr_token", $tokens);
return $tokens;
}
if(!isset($response["headers"]["set-cookie"])){
// server didn't send a cookie
return false;
}
// get cookie
preg_match(
'/PHPSESSID=([^;]+)/',
$response["headers"]["set-cookie"],
$cookie
);
if(!isset($cookie[1])){
// server sent an unexpected cookie
return false;
}
$tokens[] = $cookie[1];
apcu_store("greppr_token", $tokens);
return $tokens;
}
}