mirror of
https://git.lolcat.ca/lolcat/4get.git
synced 2024-11-14 03:49:48 +01:00
fixed mwmbl, results are slightly better but wtf did they do to the sublinks my gawd
This commit is contained in:
parent
36993013e5
commit
fbac3eeb8d
|
@ -8,10 +8,10 @@ Welcome! This guide assumes that you have a working 4get instance. This will hel
|
||||||
3. The captcha imagesets are located in `data/captcha/your_image_set/*.png`
|
3. The captcha imagesets are located in `data/captcha/your_image_set/*.png`
|
||||||
4. The captcha font is located in `data/fonts/captcha.ttf`
|
4. The captcha font is located in `data/fonts/captcha.ttf`
|
||||||
|
|
||||||
# Cloudflare bypass
|
# Cloudflare bypass (TLS check)
|
||||||
**Note: this only allows you to bypass the browser integrity checks. Captchas & JavaScript challenges will not be bypassed.**
|
**Note: this only allows you to bypass the browser integrity checks. Captchas & JavaScript challenges will not be bypassed.**
|
||||||
|
|
||||||
Configuring this lets you fetch images sitting behind Cloudflare and allows you to scrape the **Yep** search engine. Following these instructions might make your package manager unhappy.
|
Configuring this lets you fetch images sitting behind Cloudflare and allows you to scrape the **Yep** & the **Mwmbl** search engines. Please be aware that APT will fight against you and will constantly re-install the OpenSSL version of curl when updating.
|
||||||
|
|
||||||
First, follow these instructions. Only install the Firefox modules:
|
First, follow these instructions. Only install the Firefox modules:
|
||||||
|
|
||||||
|
|
|
@ -27,18 +27,24 @@ class mwmbl{
|
||||||
|
|
||||||
curl_setopt($curlproc, CURLOPT_URL, $url);
|
curl_setopt($curlproc, CURLOPT_URL, $url);
|
||||||
|
|
||||||
|
// use http2
|
||||||
|
curl_setopt($curlproc, CURLOPT_HTTP_VERSION, CURL_HTTP_VERSION_2_0);
|
||||||
|
|
||||||
curl_setopt($curlproc, CURLOPT_ENCODING, ""); // default encoding
|
curl_setopt($curlproc, CURLOPT_ENCODING, ""); // default encoding
|
||||||
curl_setopt($curlproc, CURLOPT_HTTPHEADER,
|
curl_setopt($curlproc, CURLOPT_HTTPHEADER,
|
||||||
["User-Agent: " . config::USER_AGENT,
|
["User-Agent: " . config::USER_AGENT,
|
||||||
"Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
|
"Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
|
||||||
"Accept-Language: en-US,en;q=0.5",
|
"Accept-Language: en-US,en;q=0.5",
|
||||||
"Accept-Encoding: gzip",
|
"Accept-Encoding: gzip",
|
||||||
|
"Referer: https://beta.mwmbl.org/",
|
||||||
"DNT: 1",
|
"DNT: 1",
|
||||||
|
"Sec-GPC: 1",
|
||||||
"Connection: keep-alive",
|
"Connection: keep-alive",
|
||||||
"Upgrade-Insecure-Requests: 1",
|
"Upgrade-Insecure-Requests: 1",
|
||||||
"Sec-Fetch-Dest: document",
|
"Sec-Fetch-Dest: document",
|
||||||
"Sec-Fetch-Mode: navigate",
|
"Sec-Fetch-Mode: navigate",
|
||||||
"Sec-Fetch-Site: none",
|
"Sec-Fetch-Site: same-origin",
|
||||||
|
"Priority: u=0, i",
|
||||||
"Sec-Fetch-User: ?1"]
|
"Sec-Fetch-User: ?1"]
|
||||||
);
|
);
|
||||||
|
|
||||||
|
@ -46,7 +52,7 @@ class mwmbl{
|
||||||
curl_setopt($curlproc, CURLOPT_SSL_VERIFYHOST, 2);
|
curl_setopt($curlproc, CURLOPT_SSL_VERIFYHOST, 2);
|
||||||
curl_setopt($curlproc, CURLOPT_SSL_VERIFYPEER, true);
|
curl_setopt($curlproc, CURLOPT_SSL_VERIFYPEER, true);
|
||||||
curl_setopt($curlproc, CURLOPT_CONNECTTIMEOUT, 30);
|
curl_setopt($curlproc, CURLOPT_CONNECTTIMEOUT, 30);
|
||||||
curl_setopt($curlproc, CURLOPT_TIMEOUT, 30);
|
curl_setopt($curlproc, CURLOPT_TIMEOUT, 30); // @todo reset
|
||||||
|
|
||||||
$this->backend->assign_proxy($curlproc, $proxy);
|
$this->backend->assign_proxy($curlproc, $proxy);
|
||||||
|
|
||||||
|
@ -72,14 +78,14 @@ class mwmbl{
|
||||||
try{
|
try{
|
||||||
$html = $this->get(
|
$html = $this->get(
|
||||||
$this->backend->get_ip(), // no next page!
|
$this->backend->get_ip(), // no next page!
|
||||||
"https://mwmbl.org/app/home/",
|
"https://beta.mwmbl.org/",
|
||||||
[
|
[
|
||||||
"q" => $search
|
"q" => $search
|
||||||
]
|
]
|
||||||
);
|
);
|
||||||
}catch(Exception $error){
|
}catch(Exception $error){
|
||||||
|
|
||||||
throw new Exception("Failed to fetch HTML");
|
throw new Exception("Failed to fetch HTML. If you're getting a timeout, make sure you have curl-impersonate setup.");
|
||||||
}
|
}
|
||||||
|
|
||||||
$out = [
|
$out = [
|
||||||
|
@ -115,6 +121,68 @@ class mwmbl{
|
||||||
$this->fuckhtml
|
$this->fuckhtml
|
||||||
->getElementsByTagName("p");
|
->getElementsByTagName("p");
|
||||||
|
|
||||||
|
$sublinks = [];
|
||||||
|
|
||||||
|
$mores =
|
||||||
|
$this->fuckhtml
|
||||||
|
->getElementsByClassName(
|
||||||
|
"result-link-more",
|
||||||
|
"div"
|
||||||
|
);
|
||||||
|
|
||||||
|
foreach($mores as $more){
|
||||||
|
|
||||||
|
$this->fuckhtml->load($more);
|
||||||
|
|
||||||
|
$as =
|
||||||
|
$this->fuckhtml
|
||||||
|
->getElementsByClassName(
|
||||||
|
"more",
|
||||||
|
"a"
|
||||||
|
);
|
||||||
|
|
||||||
|
if(count($as) === 0){
|
||||||
|
|
||||||
|
// ?? invalid
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
$sublinks[] = [
|
||||||
|
"title" =>
|
||||||
|
$this->titledots(
|
||||||
|
$this->fuckhtml
|
||||||
|
->getTextContent(
|
||||||
|
$this->fuckhtml
|
||||||
|
->getElementsByClassName(
|
||||||
|
"more-title",
|
||||||
|
"span"
|
||||||
|
)[0]
|
||||||
|
)
|
||||||
|
),
|
||||||
|
"description" =>
|
||||||
|
$this->titledots(
|
||||||
|
$this->fuckhtml
|
||||||
|
->getTextContent(
|
||||||
|
$this->fuckhtml
|
||||||
|
->getElementsByClassName(
|
||||||
|
"more-extract",
|
||||||
|
"span"
|
||||||
|
)[0]
|
||||||
|
)
|
||||||
|
),
|
||||||
|
"url" =>
|
||||||
|
$this->fuckhtml
|
||||||
|
->getTextContent(
|
||||||
|
$as[0]
|
||||||
|
["attributes"]
|
||||||
|
["href"]
|
||||||
|
)
|
||||||
|
];
|
||||||
|
}
|
||||||
|
|
||||||
|
// reset
|
||||||
|
$this->fuckhtml->load($result);
|
||||||
|
|
||||||
$out["web"][] = [
|
$out["web"][] = [
|
||||||
"title" =>
|
"title" =>
|
||||||
$this->titledots(
|
$this->titledots(
|
||||||
|
@ -153,7 +221,7 @@ class mwmbl{
|
||||||
"url" => null,
|
"url" => null,
|
||||||
"ratio" => null
|
"ratio" => null
|
||||||
],
|
],
|
||||||
"sublink" => [],
|
"sublink" => $sublinks,
|
||||||
"table" => []
|
"table" => []
|
||||||
];
|
];
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in a new issue