Compare commits

...

3 commits

Author SHA1 Message Date
lolcat
9cd369ac08 http2 on ddg 2024-11-07 23:37:43 -05:00
lolcat
e83865be49 added pagination 2024-11-07 00:12:06 -05:00
lolcat
68dd7f29f6 mojeek thumbnail fix 2024-11-06 23:43:54 -05:00
3 changed files with 137 additions and 35 deletions

View file

@ -28,6 +28,9 @@ class ddg{
curl_setopt($curlproc, CURLOPT_URL, $url); curl_setopt($curlproc, CURLOPT_URL, $url);
// http2 bypass
curl_setopt($curlproc, CURLOPT_HTTP_VERSION, CURL_HTTP_VERSION_2_0);
switch($reqtype){ switch($reqtype){
case self::req_web: case self::req_web:
$headers = $headers =
@ -36,27 +39,33 @@ class ddg{
"Accept-Encoding: gzip", "Accept-Encoding: gzip",
"Accept-Language: en-US,en;q=0.5", "Accept-Language: en-US,en;q=0.5",
"DNT: 1", "DNT: 1",
"Sec-GPC: 1",
"Connection: keep-alive", "Connection: keep-alive",
"Upgrade-Insecure-Requests: 1", "Upgrade-Insecure-Requests: 1",
"Sec-Fetch-Dest: document", "Sec-Fetch-Dest: document",
"Sec-Fetch-Mode: navigate", "Sec-Fetch-Mode: navigate",
"Sec-Fetch-Site: cross-site", "Sec-Fetch-Site: same-origin",
"Upgrade-Insecure-Requests: 1"]; "Sec-Fetch-User: ?1",
"Priority: u=0, i",
"TE: trailers"];
break; break;
case self::req_xhr: case self::req_xhr:
$headers = $headers =
["User-Agent: " . config::USER_AGENT, ["User-Agent: " . config::USER_AGENT,
"Accept: */*", "Accept: application/json, text/javascript, */*; q=0.01",
"Accept-Encoding: gzip", "Accept-Encoding: gzip",
"Accept-Language: en-US,en;q=0.5", "Accept-Language: en-US,en;q=0.5",
"Connection: keep-alive", "Connection: keep-alive",
"Referer: https://duckduckgo.com/", "Referer: https://duckduckgo.com/",
"X-Requested-With: XMLHttpRequest", "X-Requested-With: XMLHttpRequest",
"DNT: 1", "DNT: 1",
"Sec-Fetch-Dest: script", "Sec-GPC: 1",
"Sec-Fetch-Mode: no-cors", "Connection: keep-alive",
"Sec-Fetch-Site: same-site"]; "Sec-Fetch-Dest: empty",
"Sec-Fetch-Mode: cors",
"Sec-Fetch-Site: same-origin",
"TE: trailers"];
break; break;
} }
@ -1889,12 +1898,12 @@ class ddg{
[$npt, $proxy] = $this->backend->get($get["npt"], "images"); [$npt, $proxy] = $this->backend->get($get["npt"], "images");
try{ try{
$json = json_decode($this->get( $json = $this->get(
$proxy, $proxy,
"https://duckduckgo.com/i.js?" . $npt, "https://duckduckgo.com/i.js?" . $npt,
[], [],
ddg::req_xhr ddg::req_xhr
), true); );
}catch(Exception $err){ }catch(Exception $err){
@ -1920,6 +1929,7 @@ class ddg{
$filter = []; $filter = [];
$get_filters = [ $get_filters = [
"hps" => "1",
"q" => $search, "q" => $search,
"iax" => "images", "iax" => "images",
"ia" => "images" "ia" => "images"
@ -1970,7 +1980,7 @@ class ddg{
} }
$vqd = $vqd[1]; $vqd = $vqd[1];
// @TODO: s param = image offset // @TODO: s param = image offset
$js_params = [ $js_params = [
"l" => $country, "l" => $country,
@ -1994,12 +2004,12 @@ class ddg{
} }
try{ try{
$json = json_decode($this->get( $json = $this->get(
$proxy, $proxy,
"https://duckduckgo.com/i.js", "https://duckduckgo.com/i.js",
$js_params, $js_params,
ddg::req_xhr ddg::req_xhr
), true); );
}catch(Exception $err){ }catch(Exception $err){
@ -2007,6 +2017,13 @@ class ddg{
} }
} }
$json = json_decode($json, true);
if($json === null){
throw new Exception("Failed to decode JSON");
}
$out = [ $out = [
"status" => "ok", "status" => "ok",
"npt" => null, "npt" => null,

View file

@ -220,6 +220,7 @@ class marginalia{
"related" => [] "related" => []
]; ];
// API scraper
if(config::MARGINALIA_API_KEY !== null){ if(config::MARGINALIA_API_KEY !== null){
try{ try{
@ -263,34 +264,57 @@ class marginalia{
return $out; return $out;
} }
// no more cloudflare!! Parse html by default // HTML parser
$params = [ $proxy = $this->backend->get_ip();
"query" => $search
];
foreach(["adtech", "recent", "intitle"] as $v){ if($get["npt"]){
if($get[$v] == "yes"){ [$params, $proxy] =
$this->backend->get(
$get["npt"],
"web"
);
try{
$html =
$this->get(
$proxy,
"https://search.marginalia.nu/search?" . $params
);
}catch(Exception $error){
switch($v){ throw new Exception("Failed to get HTML");
}
}else{
$params = [
"query" => $search
];
foreach(["adtech", "recent", "intitle"] as $v){
if($get[$v] == "yes"){
case "adtech": $params["adtech"] = "reduce"; break; switch($v){
case "recent": $params["recent"] = "recent"; break;
case "adtech": $params["searchTitle"] = "title"; break; case "adtech": $params["adtech"] = "reduce"; break;
case "recent": $params["recent"] = "recent"; break;
case "adtech": $params["searchTitle"] = "title"; break;
}
} }
} }
}
try{
$html =
$this->get(
$this->backend->get_ip(),
"https://search.marginalia.nu/search",
$params
);
}catch(Exception $error){
throw new Exception("Failed to get HTML"); try{
$html =
$this->get(
$proxy,
"https://search.marginalia.nu/search",
$params
);
}catch(Exception $error){
throw new Exception("Failed to get HTML");
}
} }
$this->fuckhtml->load($html); $this->fuckhtml->load($html);
@ -387,6 +411,65 @@ class marginalia{
]; ];
} }
// get next page
$this->fuckhtml->load($html);
$pagination =
$this->fuckhtml
->getElementsByAttributeValue(
"aria-label",
"pagination",
"nav"
);
if(count($pagination) === 0){
// no pagination
return $out;
}
$this->fuckhtml->load($pagination[0]);
$pages =
$this->fuckhtml
->getElementsByClassName(
"page-link",
"a"
);
$found_current_page = false;
foreach($pages as $page){
if(
stripos(
$page["attributes"]["class"],
"active"
) !== false
){
$found_current_page = true;
continue;
}
if($found_current_page){
// we found current page index, and we iterated over
// the next page <a>
$out["npt"] =
$this->backend->store(
parse_url(
$page["attributes"]["href"],
PHP_URL_QUERY
),
"web",
$proxy
);
break;
}
}
return $out; return $out;
} }
} }

View file

@ -701,9 +701,11 @@ class mojeek{
if(count($thumb) === 2){ if(count($thumb) === 2){
$answer["thumb"] = $answer["thumb"] =
$this->fuckhtml urldecode(
->getTextContent( $this->fuckhtml
$thumb[1] ->getTextContent(
$thumb[1]
)
); );
} }
} }