fixed mwmbl, results are slightly better but wtf did they do to the sublinks my gawd

This commit is contained in:
lolcat 2024-08-08 03:29:29 -04:00
parent 36993013e5
commit fbac3eeb8d
2 changed files with 75 additions and 7 deletions

View file

@ -8,10 +8,10 @@ Welcome! This guide assumes that you have a working 4get instance. This will hel
3. The captcha imagesets are located in `data/captcha/your_image_set/*.png` 3. The captcha imagesets are located in `data/captcha/your_image_set/*.png`
4. The captcha font is located in `data/fonts/captcha.ttf` 4. The captcha font is located in `data/fonts/captcha.ttf`
# Cloudflare bypass # Cloudflare bypass (TLS check)
**Note: this only allows you to bypass the browser integrity checks. Captchas & javascript challenges will not be bypassed.** **Note: this only allows you to bypass the browser integrity checks. Captchas & javascript challenges will not be bypassed.**
Configuring this lets you fetch images sitting behind Cloudflare and allows you to scrape the **Yep** search engine. Following these instructions might make your package manager unhappy. Configuring this lets you fetch images sitting behind Cloudflare and allows you to scrape the **Yep** & the **Mwmbl** search engines. Please be aware that APT will fight against you and will constantly re-install the OpenSSL version of curl when updating.
First, follow these instructions. Only install the Firefox modules: First, follow these instructions. Only install the Firefox modules:

View file

@ -27,18 +27,24 @@ class mwmbl{
curl_setopt($curlproc, CURLOPT_URL, $url); curl_setopt($curlproc, CURLOPT_URL, $url);
// use http2
curl_setopt($curlproc, CURLOPT_HTTP_VERSION, CURL_HTTP_VERSION_2_0);
curl_setopt($curlproc, CURLOPT_ENCODING, ""); // default encoding curl_setopt($curlproc, CURLOPT_ENCODING, ""); // default encoding
curl_setopt($curlproc, CURLOPT_HTTPHEADER, curl_setopt($curlproc, CURLOPT_HTTPHEADER,
["User-Agent: " . config::USER_AGENT, ["User-Agent: " . config::USER_AGENT,
"Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8", "Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
"Accept-Language: en-US,en;q=0.5", "Accept-Language: en-US,en;q=0.5",
"Accept-Encoding: gzip", "Accept-Encoding: gzip",
"Referer: https://beta.mwmbl.org/",
"DNT: 1", "DNT: 1",
"Sec-GPC: 1",
"Connection: keep-alive", "Connection: keep-alive",
"Upgrade-Insecure-Requests: 1", "Upgrade-Insecure-Requests: 1",
"Sec-Fetch-Dest: document", "Sec-Fetch-Dest: document",
"Sec-Fetch-Mode: navigate", "Sec-Fetch-Mode: navigate",
"Sec-Fetch-Site: none", "Sec-Fetch-Site: same-origin",
"Priority: u=0, i",
"Sec-Fetch-User: ?1"] "Sec-Fetch-User: ?1"]
); );
@ -46,7 +52,7 @@ class mwmbl{
curl_setopt($curlproc, CURLOPT_SSL_VERIFYHOST, 2); curl_setopt($curlproc, CURLOPT_SSL_VERIFYHOST, 2);
curl_setopt($curlproc, CURLOPT_SSL_VERIFYPEER, true); curl_setopt($curlproc, CURLOPT_SSL_VERIFYPEER, true);
curl_setopt($curlproc, CURLOPT_CONNECTTIMEOUT, 30); curl_setopt($curlproc, CURLOPT_CONNECTTIMEOUT, 30);
curl_setopt($curlproc, CURLOPT_TIMEOUT, 30); curl_setopt($curlproc, CURLOPT_TIMEOUT, 30); // @todo reset
$this->backend->assign_proxy($curlproc, $proxy); $this->backend->assign_proxy($curlproc, $proxy);
@ -72,14 +78,14 @@ class mwmbl{
try{ try{
$html = $this->get( $html = $this->get(
$this->backend->get_ip(), // no next page! $this->backend->get_ip(), // no next page!
"https://mwmbl.org/app/home/", "https://beta.mwmbl.org/",
[ [
"q" => $search "q" => $search
] ]
); );
}catch(Exception $error){ }catch(Exception $error){
throw new Exception("Failed to fetch HTML"); throw new Exception("Failed to fetch HTML. If you're getting a timeout, make sure you have curl-impersonate setup.");
} }
$out = [ $out = [
@ -115,6 +121,68 @@ class mwmbl{
$this->fuckhtml $this->fuckhtml
->getElementsByTagName("p"); ->getElementsByTagName("p");
$sublinks = [];
$mores =
$this->fuckhtml
->getElementsByClassName(
"result-link-more",
"div"
);
foreach($mores as $more){
$this->fuckhtml->load($more);
$as =
$this->fuckhtml
->getElementsByClassName(
"more",
"a"
);
if(count($as) === 0){
// ?? invalid
continue;
}
$sublinks[] = [
"title" =>
$this->titledots(
$this->fuckhtml
->getTextContent(
$this->fuckhtml
->getElementsByClassName(
"more-title",
"span"
)[0]
)
),
"description" =>
$this->titledots(
$this->fuckhtml
->getTextContent(
$this->fuckhtml
->getElementsByClassName(
"more-extract",
"span"
)[0]
)
),
"url" =>
$this->fuckhtml
->getTextContent(
$as[0]
["attributes"]
["href"]
)
];
}
// reset
$this->fuckhtml->load($result);
$out["web"][] = [ $out["web"][] = [
"title" => "title" =>
$this->titledots( $this->titledots(
@ -153,7 +221,7 @@ class mwmbl{
"url" => null, "url" => null,
"ratio" => null "ratio" => null
], ],
"sublink" => [], "sublink" => $sublinks,
"table" => [] "table" => []
]; ];
} }