fixed mwmbl, results are slightly better but wtf did they do to the sublinks my gawd

2024-11-14 03:49:48 +01:00 · 2024-08-08 03:29:29 -04:00 · 2024-08-08 03:29:29 -04:00 · fbac3eeb8d
parent 36993013e5
commit fbac3eeb8d
2 changed files with 75 additions and 7 deletions
--- a/docs/configure.md
+++ b/docs/configure.md
@ -8,10 +8,10 @@ Welcome! This guide assumes that you have a working 4get instance. This will hel
 3. The captcha imagesets are located in `data/captcha/your_image_set/*.png`
 4. The captcha font is located in `data/fonts/captcha.ttf`
-# Cloudflare bypass
+# Cloudflare bypass (TLS check)
 **Note: this only allows you to bypass the browser integrity checks. Captchas & JavaScript challenges will not be bypassed.**
-Configuring this lets you fetch images sitting behind Cloudflare and allows you to scrape the **Yep** search engine. Following these instructions might make your package manager unhappy.
+Configuring this lets you fetch images sitting behind Cloudflare and allows you to scrape the **Yep** and **Mwmbl** search engines. Please be aware that APT will fight against you and will constantly reinstall the OpenSSL version of curl when updating.
 First, follow these instructions. Only install the Firefox modules:
--- a/scraper/mwmbl.php
+++ b/scraper/mwmbl.php
@ -27,18 +27,24 @@ class mwmbl{
 		curl_setopt($curlproc, CURLOPT_URL, $url);
 		// use http2
 		curl_setopt($curlproc, CURLOPT_HTTP_VERSION, CURL_HTTP_VERSION_2_0);
 		curl_setopt($curlproc, CURLOPT_ENCODING, ""); // default encoding
 		curl_setopt($curlproc, CURLOPT_HTTPHEADER,
 			["User-Agent: " . config::USER_AGENT,
 			"Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
 			"Accept-Language: en-US,en;q=0.5",
 			"Accept-Encoding: gzip",
 			"Referer: https://beta.mwmbl.org/",
 			"DNT: 1",
 			"Sec-GPC: 1",
 			"Connection: keep-alive",
 			"Upgrade-Insecure-Requests: 1",
 			"Sec-Fetch-Dest: document",
 			"Sec-Fetch-Mode: navigate",
-			"Sec-Fetch-Site: none",
+			"Sec-Fetch-Site: same-origin",
 			"Priority: u=0, i",
 			"Sec-Fetch-User: ?1"]
 		);
@ -46,7 +52,7 @@ class mwmbl{
 		curl_setopt($curlproc, CURLOPT_SSL_VERIFYHOST, 2);
 		curl_setopt($curlproc, CURLOPT_SSL_VERIFYPEER, true);
 		curl_setopt($curlproc, CURLOPT_CONNECTTIMEOUT, 30);
-		curl_setopt($curlproc, CURLOPT_TIMEOUT, 30);
+		curl_setopt($curlproc, CURLOPT_TIMEOUT, 30); // @todo reset
 		$this->backend->assign_proxy($curlproc, $proxy);
@ -72,14 +78,14 @@ class mwmbl{
 		try{
 			$html = $this->get(
 				$this->backend->get_ip(), // no next page!
-				"https://mwmbl.org/app/home/",
+				"https://beta.mwmbl.org/",
 				[
 					"q" => $search
 				]
 			);
 		}catch(Exception $error){
-			throw new Exception("Failed to fetch HTML");
+			throw new Exception("Failed to fetch HTML. If you're getting a timeout, make sure you have curl-impersonate setup.");
 		}
 		$out = [
@ -115,6 +121,68 @@ class mwmbl{
 				$this->fuckhtml
 				->getElementsByTagName("p");
 			$sublinks = [];
 			$mores =
 				$this->fuckhtml
 				->getElementsByClassName(
 					"result-link-more",
 					"div"
 				);
 			foreach($mores as $more){
 				$this->fuckhtml->load($more);
 				$as =
 					$this->fuckhtml
 					->getElementsByClassName(
 						"more",
 						"a"
 					);
 				if(count($as) === 0){
 					// ?? invalid
 					continue;
 				}
 				$sublinks[] = [
 					"title" =>
 						$this->titledots(
 							$this->fuckhtml
 							->getTextContent(
 								$this->fuckhtml
 								->getElementsByClassName(
 									"more-title",
 									"span"
 								)[0]
 							)
 						),
 					"description" =>
 						$this->titledots(
 							$this->fuckhtml
 							->getTextContent(
 								$this->fuckhtml
 								->getElementsByClassName(
 									"more-extract",
 									"span"
 								)[0]
 							)
 						),
 					"url" =>
 						$this->fuckhtml
 						->getTextContent(
 							$as[0]
 							["attributes"]
 							["href"]
 						)
 				];
 			}
 			// reset
 			$this->fuckhtml->load($result);
 			$out["web"][] = [
 				"title" =>
 					$this->titledots(
@ -153,7 +221,7 @@ class mwmbl{
 					"url" => null,
 					"ratio" => null
 				],
-				"sublink" => [],
+				"sublink" => $sublinks,
 				"table" => []
 			];
 		}