2023-07-22 20:41:14 +02:00
|
|
|
<?php
|
|
|
|
|
2023-11-29 16:31:59 +01:00
|
|
|
// todo:
|
|
|
|
// aliexpress tracking links
|
|
|
|
// enhanced msx notice
|
|
|
|
|
2023-07-22 20:41:14 +02:00
|
|
|
class google{
|
|
|
|
|
|
|
|
private const is_class = ".";
|
|
|
|
private const is_id = "#";
|
|
|
|
|
|
|
|
/**
 * Load the helper libraries and create the HTML parser and the
 * backend bound to the "google" scraper name.
 */
public function __construct(){
	
	// include_once: a second include of the same file in one request
	// would otherwise abort with "Cannot declare class"
	include_once "lib/fuckhtml.php";
	$this->fuckhtml = new fuckhtml();
	
	include_once "lib/backend.php";
	$this->backend = new backend("google");
}
|
|
|
|
|
|
|
|
/**
 * Describe the search filters available for a result page.
 *
 * Every page gets the shared "country", "nsfw" and "lang" filters,
 * followed by the page specific ones. Each filter is an array with a
 * "display" label and an "option" entry, which is either a
 * value => label map or the string "_DATE".
 *
 * @param string $page One of "web", "images", "videos" or "news"
 * @return array|null Filter definitions, or null for any other page
 */
public function getfilters($page){
	
	$base = [
		"country" => [ // gl=<country>
			"display" => "Country",
			"option" => [
				"any" => "Instance's country",
				"af" => "Afghanistan",
				"al" => "Albania",
				"dz" => "Algeria",
				"as" => "American Samoa",
				"ad" => "Andorra",
				"ao" => "Angola",
				"ai" => "Anguilla",
				"aq" => "Antarctica",
				"ag" => "Antigua and Barbuda",
				"ar" => "Argentina",
				"am" => "Armenia",
				"aw" => "Aruba",
				"au" => "Australia",
				"at" => "Austria",
				"az" => "Azerbaijan",
				"bs" => "Bahamas",
				"bh" => "Bahrain",
				"bd" => "Bangladesh",
				"bb" => "Barbados",
				"by" => "Belarus",
				"be" => "Belgium",
				"bz" => "Belize",
				"bj" => "Benin",
				"bm" => "Bermuda",
				"bt" => "Bhutan",
				"bo" => "Bolivia",
				"ba" => "Bosnia and Herzegovina",
				"bw" => "Botswana",
				"bv" => "Bouvet Island",
				"br" => "Brazil",
				"io" => "British Indian Ocean Territory",
				"bn" => "Brunei Darussalam",
				"bg" => "Bulgaria",
				"bf" => "Burkina Faso",
				"bi" => "Burundi",
				"kh" => "Cambodia",
				"cm" => "Cameroon",
				"ca" => "Canada",
				"cv" => "Cape Verde",
				"ky" => "Cayman Islands",
				"cf" => "Central African Republic",
				"td" => "Chad",
				"cl" => "Chile",
				"cn" => "China",
				"cx" => "Christmas Island",
				"cc" => "Cocos (Keeling) Islands",
				"co" => "Colombia",
				"km" => "Comoros",
				"cg" => "Congo",
				"cd" => "Congo, the Democratic Republic",
				"ck" => "Cook Islands",
				"cr" => "Costa Rica",
				"ci" => "Cote D'ivoire",
				"hr" => "Croatia",
				"cu" => "Cuba",
				"cy" => "Cyprus",
				"cz" => "Czech Republic",
				"dk" => "Denmark",
				"dj" => "Djibouti",
				"dm" => "Dominica",
				"do" => "Dominican Republic",
				"ec" => "Ecuador",
				"eg" => "Egypt",
				"sv" => "El Salvador",
				"gq" => "Equatorial Guinea",
				"er" => "Eritrea",
				"ee" => "Estonia",
				"et" => "Ethiopia",
				"fk" => "Falkland Islands (Malvinas)",
				"fo" => "Faroe Islands",
				"fj" => "Fiji",
				"fi" => "Finland",
				"fr" => "France",
				"gf" => "French Guiana",
				"pf" => "French Polynesia",
				"tf" => "French Southern Territories",
				"ga" => "Gabon",
				"gm" => "Gambia",
				"ge" => "Georgia",
				"de" => "Germany",
				"gh" => "Ghana",
				"gi" => "Gibraltar",
				"gr" => "Greece",
				"gl" => "Greenland",
				"gd" => "Grenada",
				"gp" => "Guadeloupe",
				"gu" => "Guam",
				"gt" => "Guatemala",
				"gn" => "Guinea",
				"gw" => "Guinea-Bissau",
				"gy" => "Guyana",
				"ht" => "Haiti",
				"hm" => "Heard Island and Mcdonald Islands",
				"va" => "Holy See (Vatican City State)",
				"hn" => "Honduras",
				"hk" => "Hong Kong",
				"hu" => "Hungary",
				"is" => "Iceland",
				"in" => "India",
				"id" => "Indonesia",
				"ir" => "Iran, Islamic Republic",
				"iq" => "Iraq",
				"ie" => "Ireland",
				"il" => "Israel",
				"it" => "Italy",
				"jm" => "Jamaica",
				"jp" => "Japan",
				"jo" => "Jordan",
				"kz" => "Kazakhstan",
				"ke" => "Kenya",
				"ki" => "Kiribati",
				"kp" => "Korea, Democratic People's Republic",
				"kr" => "Korea, Republic",
				"kw" => "Kuwait",
				"kg" => "Kyrgyzstan",
				"la" => "Lao People's Democratic Republic",
				"lv" => "Latvia",
				"lb" => "Lebanon",
				"ls" => "Lesotho",
				"lr" => "Liberia",
				"ly" => "Libyan Arab Jamahiriya",
				"li" => "Liechtenstein",
				"lt" => "Lithuania",
				"lu" => "Luxembourg",
				"mo" => "Macao",
				"mk" => "Macedonia, the Former Yugoslav Republic",
				"mg" => "Madagascar",
				"mw" => "Malawi",
				"my" => "Malaysia",
				"mv" => "Maldives",
				"ml" => "Mali",
				"mt" => "Malta",
				"mh" => "Marshall Islands",
				"mq" => "Martinique",
				"mr" => "Mauritania",
				"mu" => "Mauritius",
				"yt" => "Mayotte",
				"mx" => "Mexico",
				"fm" => "Micronesia, Federated States",
				"md" => "Moldova, Republic",
				"mc" => "Monaco",
				"mn" => "Mongolia",
				"ms" => "Montserrat",
				"ma" => "Morocco",
				"mz" => "Mozambique",
				"mm" => "Myanmar",
				"na" => "Namibia",
				"nr" => "Nauru",
				"np" => "Nepal",
				"nl" => "Netherlands",
				"an" => "Netherlands Antilles",
				"nc" => "New Caledonia",
				"nz" => "New Zealand",
				"ni" => "Nicaragua",
				"ne" => "Niger",
				"ng" => "Nigeria",
				"nu" => "Niue",
				"nf" => "Norfolk Island",
				"mp" => "Northern Mariana Islands",
				"no" => "Norway",
				"om" => "Oman",
				"pk" => "Pakistan",
				"pw" => "Palau",
				"ps" => "Palestinian Territory, Occupied",
				"pa" => "Panama",
				"pg" => "Papua New Guinea",
				"py" => "Paraguay",
				"pe" => "Peru",
				"ph" => "Philippines",
				"pn" => "Pitcairn",
				"pl" => "Poland",
				"pt" => "Portugal",
				"pr" => "Puerto Rico",
				"qa" => "Qatar",
				"re" => "Reunion",
				"ro" => "Romania",
				"ru" => "Russian Federation",
				"rw" => "Rwanda",
				"sh" => "Saint Helena",
				"kn" => "Saint Kitts and Nevis",
				"lc" => "Saint Lucia",
				"pm" => "Saint Pierre and Miquelon",
				"vc" => "Saint Vincent and the Grenadines",
				"ws" => "Samoa",
				"sm" => "San Marino",
				"st" => "Sao Tome and Principe",
				"sa" => "Saudi Arabia",
				"sn" => "Senegal",
				"cs" => "Serbia and Montenegro",
				"sc" => "Seychelles",
				"sl" => "Sierra Leone",
				"sg" => "Singapore",
				"sk" => "Slovakia",
				"si" => "Slovenia",
				"sb" => "Solomon Islands",
				"so" => "Somalia",
				"za" => "South Africa",
				"gs" => "South Georgia and the South Sandwich Islands",
				"es" => "Spain",
				"lk" => "Sri Lanka",
				"sd" => "Sudan",
				"sr" => "Suriname",
				"sj" => "Svalbard and Jan Mayen",
				"sz" => "Swaziland",
				"se" => "Sweden",
				"ch" => "Switzerland",
				"sy" => "Syrian Arab Republic",
				"tw" => "Taiwan, Province of China",
				"tj" => "Tajikistan",
				"tz" => "Tanzania, United Republic",
				"th" => "Thailand",
				"tl" => "Timor-Leste",
				"tg" => "Togo",
				"tk" => "Tokelau",
				"to" => "Tonga",
				"tt" => "Trinidad and Tobago",
				"tn" => "Tunisia",
				"tr" => "Turkey",
				"tm" => "Turkmenistan",
				"tc" => "Turks and Caicos Islands",
				"tv" => "Tuvalu",
				"ug" => "Uganda",
				"ua" => "Ukraine",
				"ae" => "United Arab Emirates",
				"uk" => "United Kingdom",
				"us" => "United States",
				"um" => "United States Minor Outlying Islands",
				"uy" => "Uruguay",
				"uz" => "Uzbekistan",
				"vu" => "Vanuatu",
				"ve" => "Venezuela",
				"vn" => "Viet Nam",
				"vg" => "Virgin Islands, British",
				"vi" => "Virgin Islands, U.S.",
				"wf" => "Wallis and Futuna",
				"eh" => "Western Sahara",
				"ye" => "Yemen",
				"zm" => "Zambia",
				"zw" => "Zimbabwe"
			]
		],
		"nsfw" => [
			"display" => "NSFW",
			"option" => [
				"yes" => "Yes", // safe=off
				"no" => "No" // safe=active
			]
		],
		"lang" => [ // lr=<lang> (prefix lang with "lang_")
			"display" => "Language",
			"option" => [
				"any" => "Any language",
				"ar" => "Arabic",
				"bg" => "Bulgarian",
				"ca" => "Catalan",
				"cs" => "Czech",
				"da" => "Danish",
				"de" => "German",
				"el" => "Greek",
				"en" => "English",
				"es" => "Spanish",
				"et" => "Estonian",
				"fi" => "Finnish",
				"fr" => "French",
				"hr" => "Croatian",
				"hu" => "Hungarian",
				"id" => "Indonesian",
				"is" => "Icelandic",
				"it" => "Italian",
				"iw" => "Hebrew",
				"ja" => "Japanese",
				"ko" => "Korean",
				"lt" => "Lithuanian",
				"lv" => "Latvian",
				"nl" => "Dutch",
				"no" => "Norwegian",
				"pl" => "Polish",
				"pt" => "Portuguese",
				"ro" => "Romanian",
				"ru" => "Russian",
				"sk" => "Slovak",
				"sl" => "Slovenian",
				"sr" => "Serbian",
				"sv" => "Swedish",
				"tr" => "Turkish",
				"zh-CN" => "Chinese (Simplified)",
				"zh-TW" => "Chinese (Traditional)"
			]
		]
	];
	
	switch($page){
		
		case "web":
			return array_merge(
				$base,
				[
					"newer" => [ // &sort=review-date:r:20090301:20090430
						"display" => "Newer than",
						"option" => "_DATE"
					],
					"older" => [
						"display" => "Older than",
						"option" => "_DATE"
					]
				]
			);
		
		case "images":
			return array_merge(
				$base,
				[
					"time" => [ // tbs=qdr:<time>
						"display" => "Time posted",
						"option" => [
							"any" => "Any time",
							"d" => "Past 24 hours",
							"w" => "Past week",
							"m" => "Past month",
							"y" => "Past year"
						]
					],
					"size" => [
						"display" => "Size",
						"option" => [
							// tbs=isz:<size>
							"any" => "Any size",
							"l" => "Large",
							"m" => "Medium",
							"i" => "Icon",
							// from here
							// tbs=isz:lt,islt:<size>
							"qsvga" => "Larger than 400x300",
							"vga" => "Larger than 640x480",
							// was a second "qsvga" key, which overwrote
							// the 400x300 entry above
							"svga" => "Larger than 800x600",
							"xga" => "Larger than 1024x768",
							"2mp" => "Larger than 2MP",
							"4mp" => "Larger than 4MP",
							"6mp" => "Larger than 6MP",
							"8mp" => "Larger than 8MP",
							"10mp" => "Larger than 10MP",
							"12mp" => "Larger than 12MP",
							"15mp" => "Larger than 15MP",
							"20mp" => "Larger than 20MP",
							"40mp" => "Larger than 40MP",
							"70mp" => "Larger than 70MP"
						]
					],
					"ratio" => [ // tbs=iar:<size>
						"display" => "Aspect ratio",
						"option" => [
							"any" => "Any ratio",
							"t" => "Tall",
							"s" => "Square",
							"w" => "Wide",
							"xw" => "Panoramic"
						]
					],
					"color" => [ // tbs=ic:<color>
						"display" => "Color",
						"option" => [
							"any" => "Any color",
							"color" => "Full color",
							// NOTE(review): "gray" is declared again below, so
							// PHP keeps this slot but with the label "Gray" and
							// "Black & white" is never offered. Giving it a
							// distinct key needs a matching change in the code
							// that consumes this filter (not visible here).
							"gray" => "Black & white",
							"trans" => "Transparent",
							// from there, its ic:specific,isc:<color>
							"red" => "Red",
							"orange" => "Orange",
							"yellow" => "Yellow",
							"green" => "Green",
							"teal" => "Teal",
							"blue" => "Blue",
							"purple" => "Purple",
							"pink" => "Pink",
							"white" => "White",
							"gray" => "Gray",
							"black" => "Black",
							"brown" => "Brown"
						]
					],
					"type" => [ // tbs=itp:<type>
						"display" => "Type",
						"option" => [
							"any" => "Any type",
							"face" => "Faces",
							"clipart" => "Clip Art",
							"lineart" => "Line Drawing",
							"stock" => "Stock",
							"animated" => "Animated"
						]
					],
					"format" => [ // tbs=ift:<format>
						"display" => "Format",
						"option" => [
							"any" => "Any format",
							"jpg" => "JPG",
							"gif" => "GIF",
							"png" => "PNG",
							"bmp" => "BMP",
							"svg" => "SVG",
							"webp" => "WEBP",
							"ico" => "ICO",
							"craw" => "RAW"
						]
					],
					"rights" => [ // tbs=il:<rights>
						"display" => "Usage rights",
						"option" => [
							"any" => "Any license",
							"cl" => "Creative Commons licenses",
							"ol" => "Commercial & other licenses"
						]
					]
				]
			);
		
		case "videos":
			return array_merge(
				$base,
				[
					"time" => [
						"display" => "Time posted",
						"option" => [ // tbs=qdr
							"any" => "Any time",
							"h" => "Past hour",
							"d" => "Past 24 hours",
							"w" => "Past week",
							"m" => "Past month",
							"y" => "Past year"
						]
					],
					"duration" => [
						"display" => "Duration",
						"option" => [
							"any" => "Any duration",
							"s" => "Short (0-4min)", // tbs=dur:s
							"m" => "Medium (4-20min)", // tbs=dur:m
							"l" => "Long (20+ min)" // tbs=dur:l
						]
					],
					"quality" => [
						"display" => "Quality",
						"option" => [
							"any" => "Any quality",
							"h" => "High quality" // tbs=hq:h
						]
					],
					"captions" => [
						"display" => "Captions",
						"option" => [
							"any" => "No preference",
							"yes" => "Closed captioned" // tbs=cc:1
						]
					]
				]
			);
		
		case "news":
			return array_merge(
				$base,
				[
					"time" => [
						"display" => "Time posted",
						"option" => [ // tbs=qdr
							"any" => "Any time",
							"h" => "Past hour",
							"d" => "Past 24 hours",
							"w" => "Past week",
							"m" => "Past month",
							"y" => "Past year",
							"a" => "Archives" // tbs=ar:1
						]
					],
					"sort" => [
						"display" => "Sort",
						"option" => [
							"relevance" => "Relevance",
							"date" => "Date" // sbd:1
						]
					]
				]
			);
	}
	
	// any other page: no return statement is reached, so the result is null
}
|
|
|
|
|
2023-11-07 14:04:56 +01:00
|
|
|
/**
 * Perform a GET request and return the response body.
 *
 * @param mixed $proxy Handed unchanged to $this->backend->assign_proxy()
 * @param string $url Target URL
 * @param array $get Query parameters; when not empty they are encoded
 *                   and appended to $url after a "?"
 * @return string|bool Whatever curl_exec() returned
 * @throws Exception With the cURL error message when the transfer fails
 */
private function get($proxy, $url, $get = []){
	
	$headers = [
		"User-Agent: Mozilla/5.0 (Linux; U; Android 2.3.3; pt-pt; LG-P500h-parrot Build/GRI40) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1 MMS/LG-Android-MMS-V1.0/1.2",
		"Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
		"Accept-Language: en-US,en;q=0.5",
		"Accept-Encoding: gzip",
		"DNT: 1",
		"Cookie: SOCS=CAESNQgCEitib3FfaWRlbnRpdHlmcm9udGVuZHVpc2VydmVyXzIwMjQwMzE3LjA4X3AwGgJlbiAEGgYIgM7orwY",
		"Connection: keep-alive",
		"Upgrade-Insecure-Requests: 1",
		"Sec-Fetch-Dest: document",
		"Sec-Fetch-Mode: navigate",
		"Sec-Fetch-Site: none",
		"Sec-Fetch-User: ?1"
	];
	
	if($get !== []){
		
		$url .= "?" . http_build_query($get);
	}
	
	$curlproc = curl_init();
	
	try{
		
		curl_setopt($curlproc, CURLOPT_URL, $url);
		
		curl_setopt($curlproc, CURLOPT_ENCODING, ""); // default encoding
		curl_setopt($curlproc, CURLOPT_HTTPHEADER, $headers);
		
		curl_setopt($curlproc, CURLOPT_RETURNTRANSFER, true);
		curl_setopt($curlproc, CURLOPT_SSL_VERIFYHOST, 2);
		curl_setopt($curlproc, CURLOPT_SSL_VERIFYPEER, true);
		curl_setopt($curlproc, CURLOPT_CONNECTTIMEOUT, 30);
		curl_setopt($curlproc, CURLOPT_TIMEOUT, 30);
		
		$this->backend->assign_proxy($curlproc, $proxy);
		
		$data = curl_exec($curlproc);
		
		if(curl_errno($curlproc)){
			
			throw new Exception(curl_error($curlproc));
		}
		
		return $data;
		
	}finally{
		
		// release the handle on the error path too
		curl_close($curlproc);
	}
}
|
2023-11-27 07:01:56 +01:00
|
|
|
|
|
|
|
|
|
|
|
|
2023-07-22 20:41:14 +02:00
|
|
|
/**
 * Run a web search and return the parsed result page.
 *
 * When $get["npt"] is set, the request path and proxy stored under
 * that token are fetched from the backend and replayed. Otherwise a
 * new query is built from the "s", "country", "nsfw", "lang",
 * "older" and "newer" entries of $get.
 *
 * @param array $get Request parameters; "older"/"newer" are either
 *                   false or a timestamp accepted by date()
 * @return mixed Result of parsepage() for the "web" page type
 * @throws Exception "Failed to get HTML" when the request fails; the
 *                   underlying error is attached as the previous exception
 */
public function web($get){
	
	if($get["npt"]){
		
		[$req, $ip] = $this->backend->get($get["npt"], "web");
		
		// parse_url() yields null when there is no query string
		parse_str(
			(string)parse_url($req, PHP_URL_QUERY),
			$search
		);
		
		// recover the search term from the stored request,
		// fall back to a dummy term
		$search = isset($search["q"]) ? $search["q"] : "a"; // lol
		
		$url = "https://www.google.com" . $req;
		$params = [];
		
	}else{
		
		$search = $get["s"];
		$country = $get["country"];
		$nsfw = $get["nsfw"];
		$lang = $get["lang"];
		$older = $get["older"];
		$newer = $get["newer"];
		$ip = $this->backend->get_ip();
		
		$url = "https://www.google.com/search";
		$params = [
			"q" => $search,
			"hl" => "en",
			"num" => 20 // get 20 results
		];
		
		// country
		if($country != "any"){
			
			$params["gl"] = $country;
		}
		
		// nsfw
		$params["safe"] = $nsfw == "yes" ? "off" : "active";
		
		// language
		if($lang != "any"){
			
			$params["lr"] = "lang_" . $lang;
		}
		
		// &sort=review-date:r:20090301:20090430
		// NOTE(review): the range is emitted as <older>:<newer>, which
		// looks reversed for a from:to range -- confirm against Google's
		// behaviour before changing; kept as is
		$older = $older === false ? false : date("Ymd", $older);
		$newer = $newer === false ? false : date("Ymd", $newer);
		
		if(
			$older !== false &&
			$newer === false
		){
			
			$newer = date("Ymd", time());
		}
		
		if(
			$older !== false ||
			$newer !== false
		){
			
			$params["sort"] = "review-date:r:" . $older . ":" . $newer;
		}
	}
	
	try{
		
		$html = $this->get($ip, $url, $params);
		
	}catch(Exception $error){
		
		// same message as before, original cause kept for debugging
		throw new Exception("Failed to get HTML", 0, $error);
	}
	
	return $this->parsepage($html, "web", $search, $ip);
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
/**
 * Run a video search (tbm=vid) and map the parsed results to the
 * video response layout.
 *
 * When $get["npt"] is set, the request path and proxy stored under
 * that token are fetched from the backend and replayed. Otherwise a
 * new query is built from the "s", "country", "nsfw", "lang", "time",
 * "duration", "quality" and "captions" entries of $get.
 *
 * @param array $get Request parameters
 * @return array "status", "npt" and the lists "video", "author",
 *               "livestream", "playlist" and "reel" (only "video" is filled)
 * @throws Exception "Failed to get HTML" when the request fails; the
 *                   underlying error is attached as the previous exception
 */
public function video($get){
	
	if($get["npt"]){
		
		[$req, $ip] = $this->backend->get($get["npt"], "videos");
		
		// parse_url() yields null when there is no query string
		parse_str(
			(string)parse_url($req, PHP_URL_QUERY),
			$search
		);
		
		$search = isset($search["q"]) ? $search["q"] : "a"; // lol
		
		$url = "https://www.google.com" . $req;
		$params = [];
		
	}else{
		
		$search = $get["s"];
		$country = $get["country"];
		$nsfw = $get["nsfw"];
		$lang = $get["lang"];
		$time = $get["time"];
		$duration = $get["duration"];
		$quality = $get["quality"];
		$captions = $get["captions"];
		$ip = $this->backend->get_ip();
		
		$url = "https://www.google.com/search";
		$params = [
			"q" => $search,
			"tbm" => "vid",
			"hl" => "en",
			"num" => "20"
		];
		
		// country
		if($country != "any"){
			
			$params["gl"] = $country;
		}
		
		// nsfw
		$params["safe"] = $nsfw == "yes" ? "off" : "active";
		
		// language
		if($lang != "any"){
			
			$params["lr"] = "lang_" . $lang;
		}
		
		// every remaining filter becomes one "<name>:<value>" part of tbs
		$tbs = [];
		
		// time
		if($time != "any"){
			
			$tbs[] = "qdr:" . $time;
		}
		
		// duration
		if($duration != "any"){
			
			$tbs[] = "dur:" . $duration;
		}
		
		// quality
		if($quality != "any"){
			
			$tbs[] = "hq:" . $quality;
		}
		
		// captions
		if($captions != "any"){
			
			$tbs[] = "cc:" . $captions;
		}
		
		// append tbs
		if(count($tbs) !== 0){
			
			$params["tbs"] = implode(",", $tbs);
		}
	}
	
	try{
		
		$html = $this->get($ip, $url, $params);
		
	}catch(Exception $error){
		
		// same message as before, original cause kept for debugging
		throw new Exception("Failed to get HTML", 0, $error);
	}
	
	$json = $this->parsepage($html, "videos", $search, $ip);
	
	$out = [
		"status" => "ok",
		"npt" => $json["npt"],
		"video" => [],
		"author" => [],
		"livestream" => [],
		"playlist" => [],
		"reel" => []
	];
	
	// parsepage() returns the entries under "web"
	foreach($json["web"] as $item){
		
		if($item["thumb"]["url"] === null){
			
			$thumb = [
				"url" => null,
				"ratio" => null
			];
		}else{
			
			$thumb = [
				"url" => $item["thumb"]["url"],
				"ratio" => "16:9"
			];
		}
		
		$out["video"][] = [
			"title" => $item["title"],
			"description" => $item["description"],
			"author" => [
				"name" => null,
				"url" => null,
				"avatar" => null
			],
			"date" => isset($item["table"]["Posted"]) ? strtotime($item["table"]["Posted"]) : null,
			"duration" => isset($item["table"]["Duration"]) ? $this->hms2int($item["table"]["Duration"]) : null,
			"views" => null,
			"thumb" => $thumb,
			"url" => $item["url"]
		];
	}
	
	return $out;
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
/**
 * Run a news search (tbm=nws) and map the parsed results to the
 * news response layout.
 *
 * When $get["npt"] is set, the request path and proxy stored under
 * that token are fetched from the backend and replayed. Otherwise a
 * new query is built from the "s", "country", "nsfw", "lang", "time"
 * and "sort" entries of $get.
 *
 * @param array $get Request parameters
 * @return array "status", "npt" and the "news" list
 * @throws Exception "Failed to get HTML" when the request fails; the
 *                   underlying error is attached as the previous exception
 */
public function news($get){
	
	if($get["npt"]){
		
		[$req, $ip] = $this->backend->get($get["npt"], "news");
		
		// parse_url() yields null when there is no query string
		parse_str(
			(string)parse_url($req, PHP_URL_QUERY),
			$search
		);
		
		$search = isset($search["q"]) ? $search["q"] : "a"; // lol
		
		$url = "https://www.google.com" . $req;
		$params = [];
		
	}else{
		
		$search = $get["s"];
		$country = $get["country"];
		$nsfw = $get["nsfw"];
		$lang = $get["lang"];
		$time = $get["time"];
		$sort = $get["sort"];
		$ip = $this->backend->get_ip();
		
		$url = "https://www.google.com/search";
		$params = [
			"q" => $search,
			"tbm" => "nws",
			"hl" => "en",
			"num" => "20"
		];
		
		// country
		if($country != "any"){
			
			$params["gl"] = $country;
		}
		
		// nsfw
		$params["safe"] = $nsfw == "yes" ? "off" : "active";
		
		// language
		if($lang != "any"){
			
			$params["lr"] = "lang_" . $lang;
		}
		
		$tbs = [];
		
		// time ("a" selects the archives instead of a time span)
		if($time != "any"){
			
			$tbs[] = $time == "a" ? "ar:1" : "qdr:" . $time;
		}
		
		// relevance
		if($sort == "date"){
			
			$tbs[] = "sbd:1";
		}
		
		// append tbs
		if(count($tbs) !== 0){
			
			$params["tbs"] = implode(",", $tbs);
		}
	}
	
	// the fresh-search request used to run unguarded, so raw cURL
	// errors reached the caller; both branches now fail the same way
	try{
		
		$html = $this->get($ip, $url, $params);
		
	}catch(Exception $error){
		
		throw new Exception("Failed to get HTML", 0, $error);
	}
	
	$json = $this->parsepage($html, "news", $search, $ip);
	
	$out = [
		"status" => "ok",
		"npt" => $json["npt"],
		"news" => []
	];
	
	foreach($json["web"] as $item){
		
		// the first table entry is read as description (key) => date (value)
		$description = array_key_first($item["table"]);
		
		// no table entry: report null instead of calling strtotime(null)
		$date =
			$description === null ?
			null :
			strtotime($item["table"][$description]);
		
		if($item["thumb"]["url"] === null){
			
			$thumb = [
				"url" => null,
				"ratio" => null
			];
		}else{
			
			$thumb = [
				"url" => $item["thumb"]["url"],
				"ratio" => "16:9"
			];
		}
		
		$out["news"][] = [
			"title" => $item["title"],
			"author" => $item["author"],
			"description" => $description,
			"date" => $date,
			"thumb" => $thumb,
			"url" => $item["url"]
		];
	}
	
	return $out;
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
private function parsepage($html, $pagetype, $search, $ip){
|
|
|
|
/*
|
2023-07-22 20:41:14 +02:00
|
|
|
$handle = fopen("scraper/google.html", "r");
|
|
|
|
$html = fread($handle, filesize("scraper/google.html"));
|
|
|
|
fclose($handle);
|
2023-11-27 07:01:56 +01:00
|
|
|
*/
|
2023-07-22 20:41:14 +02:00
|
|
|
|
|
|
|
$out = [
|
|
|
|
"status" => "ok",
|
|
|
|
"spelling" => [
|
|
|
|
"type" => "no_correction",
|
|
|
|
"using" => null,
|
|
|
|
"correction" => null
|
|
|
|
],
|
|
|
|
"npt" => null,
|
|
|
|
"answer" => [],
|
|
|
|
"web" => [],
|
|
|
|
"image" => [],
|
|
|
|
"video" => [],
|
|
|
|
"news" => [],
|
|
|
|
"related" => []
|
|
|
|
];
|
2023-08-27 07:45:59 +02:00
|
|
|
|
2024-03-20 15:59:51 +01:00
|
|
|
if($error = $this->detect_sorry($html)){
|
2024-02-25 15:51:18 +01:00
|
|
|
|
2024-03-20 15:59:51 +01:00
|
|
|
throw new Exception($error);
|
2024-02-25 15:51:18 +01:00
|
|
|
}
|
|
|
|
|
2023-08-27 07:45:59 +02:00
|
|
|
$this->parsejavascript($html);
|
|
|
|
|
2023-11-27 07:01:56 +01:00
|
|
|
//
|
|
|
|
// parse accdef's
|
|
|
|
//
|
|
|
|
$has_appended_accdef = false;
|
|
|
|
|
|
|
|
preg_match_all(
|
|
|
|
'/window\.jsl\.dh\(\'(accdef_[0-9]+)\',\'(.*)\'\);/',
|
|
|
|
$html,
|
|
|
|
$accdefs_regex
|
|
|
|
);
|
|
|
|
|
|
|
|
$accdefs = [];
|
|
|
|
for($i=0; $i<count($accdefs_regex[0]); $i++){
|
|
|
|
|
|
|
|
// decode UTF-16 string
|
|
|
|
$answer =
|
|
|
|
$this->fuckhtml
|
|
|
|
->parseJsString(
|
|
|
|
$accdefs_regex[2][$i]
|
|
|
|
);
|
|
|
|
|
|
|
|
$this->fuckhtml->load($answer);
|
|
|
|
|
|
|
|
// get description
|
|
|
|
$description =
|
|
|
|
$this->fuckhtml
|
|
|
|
->getElementsByClassName(
|
|
|
|
$this->findstyles(
|
|
|
|
[
|
|
|
|
"padding" => "12px 16px 12px",
|
|
|
|
],
|
|
|
|
self::is_class
|
|
|
|
),
|
|
|
|
"div"
|
|
|
|
)[1];
|
|
|
|
|
|
|
|
// get date (rare)
|
|
|
|
$date =
|
|
|
|
$this->fuckhtml
|
|
|
|
->getElementsByTagName("sub");
|
|
|
|
|
|
|
|
if(count($date) !== 0){
|
|
|
|
|
|
|
|
$description =
|
|
|
|
str_replace(
|
|
|
|
$date[0]["outerHTML"],
|
|
|
|
"",
|
|
|
|
$description["innerHTML"]
|
|
|
|
);
|
|
|
|
|
|
|
|
$date =
|
|
|
|
strtotime(
|
|
|
|
$this->fuckhtml
|
|
|
|
->getTextContent(
|
|
|
|
$date[0]
|
|
|
|
)
|
|
|
|
);
|
|
|
|
}else{
|
|
|
|
|
|
|
|
$date = null;
|
|
|
|
}
|
|
|
|
|
|
|
|
// get information table
|
|
|
|
$table = [];
|
|
|
|
|
|
|
|
$tbody =
|
|
|
|
$this->fuckhtml
|
|
|
|
->getElementsByTagName("tbody");
|
|
|
|
|
|
|
|
if(count($tbody) !== 0){
|
|
|
|
|
|
|
|
$this->fuckhtml->load($tbody[0]);
|
|
|
|
|
|
|
|
$trs =
|
|
|
|
$this->fuckhtml
|
|
|
|
->getElementsByTagName("tr");
|
|
|
|
|
|
|
|
foreach($trs as $tr){
|
|
|
|
|
|
|
|
$this->fuckhtml->load($tr);
|
|
|
|
|
|
|
|
$tds =
|
|
|
|
$this->fuckhtml
|
|
|
|
->getElementsByTagName("td");
|
|
|
|
|
|
|
|
if(count($tds) === 2){
|
|
|
|
|
|
|
|
$table[
|
|
|
|
$this->fuckhtml
|
|
|
|
->getTextContent(
|
|
|
|
$tds[0]
|
|
|
|
)
|
|
|
|
] =
|
|
|
|
$this->fuckhtml
|
|
|
|
->getTextContent(
|
|
|
|
$tds[1]
|
|
|
|
);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// load back what we had
|
|
|
|
$this->fuckhtml->load($answer);
|
|
|
|
}
|
|
|
|
|
|
|
|
// get title & link
|
|
|
|
$a =
|
|
|
|
$this->fuckhtml
|
|
|
|
->getElementsByTagName("a")[0];
|
|
|
|
|
|
|
|
$this->fuckhtml->load($a);
|
|
|
|
|
|
|
|
$title =
|
|
|
|
$this->fuckhtml
|
|
|
|
->getElementsByTagName("span");
|
|
|
|
|
|
|
|
if(count($title) === 0){
|
|
|
|
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
|
|
|
$accdefs[] = [
|
|
|
|
"title" =>
|
|
|
|
$this->titledots(
|
|
|
|
$this->fuckhtml
|
|
|
|
->getTextContent(
|
|
|
|
$title[0]
|
|
|
|
)
|
|
|
|
),
|
|
|
|
"description" =>
|
|
|
|
$this->fuckhtml
|
|
|
|
->getTextContent(
|
|
|
|
$description
|
|
|
|
),
|
|
|
|
"url" =>
|
|
|
|
$this->unshiturl(
|
|
|
|
$a["attributes"]["href"]
|
|
|
|
),
|
|
|
|
"date" => $date,
|
|
|
|
"type" => "web",
|
|
|
|
"thumb" => [
|
|
|
|
"url" => null,
|
|
|
|
"ratio" => null
|
|
|
|
],
|
|
|
|
"sublink" => [],
|
|
|
|
"table" => $table
|
|
|
|
];
|
|
|
|
}
|
|
|
|
|
|
|
|
$this->fuckhtml->load($html);
|
|
|
|
|
2023-08-27 07:45:59 +02:00
|
|
|
$containers =
|
|
|
|
$this->fuckhtml
|
|
|
|
->getElementsByClassName(
|
|
|
|
$this->findstyles(
|
|
|
|
[
|
|
|
|
"background-color" => "#fff",
|
|
|
|
"margin-bottom" => "10px",
|
|
|
|
"-webkit-box-shadow" => "0 1px 6px rgba(32,33,36,0.28)",
|
|
|
|
"border-radius" => "8px"
|
|
|
|
],
|
|
|
|
self::is_class
|
|
|
|
),
|
|
|
|
"div"
|
|
|
|
);
|
|
|
|
|
|
|
|
foreach($containers as $container){
|
|
|
|
|
|
|
|
$this->fuckhtml->load($container);
|
|
|
|
|
2023-11-27 07:01:56 +01:00
|
|
|
// detect spelling
|
|
|
|
$spelling =
|
|
|
|
$this->fuckhtml
|
|
|
|
->getElementsByClassName(
|
|
|
|
$this->findstyles(
|
|
|
|
[
|
|
|
|
"font-size" => "20px",
|
|
|
|
"line-height" => "26px",
|
|
|
|
"padding-top" => "2px",
|
|
|
|
"margin-bottom" => "1px"
|
|
|
|
],
|
|
|
|
self::is_class
|
|
|
|
),
|
|
|
|
"div"
|
|
|
|
);
|
|
|
|
|
|
|
|
if(count($spelling) !== 0){
|
|
|
|
|
|
|
|
$a =
|
|
|
|
$this->fuckhtml
|
|
|
|
->getElementsByTagName("a");
|
|
|
|
|
|
|
|
if(count($a) !== 0){
|
|
|
|
|
|
|
|
$scripts =
|
|
|
|
$this->fuckhtml
|
|
|
|
->getElementsByTagName("script");
|
|
|
|
|
|
|
|
foreach($scripts as $script){
|
|
|
|
|
|
|
|
$container["innerHTML"] =
|
|
|
|
str_replace(
|
|
|
|
$script["outerHTML"],
|
|
|
|
"",
|
|
|
|
$container["innerHTML"]
|
|
|
|
);
|
|
|
|
}
|
|
|
|
|
|
|
|
$container["innerHTML"] =
|
|
|
|
$this->fuckhtml
|
|
|
|
->getTextContent(
|
|
|
|
str_replace(
|
|
|
|
$a[0]["outerHTML"],
|
|
|
|
"",
|
|
|
|
$container["innerHTML"]
|
|
|
|
)
|
|
|
|
);
|
|
|
|
|
|
|
|
if(
|
|
|
|
preg_match(
|
|
|
|
'/^did you mean/i',
|
|
|
|
$container["innerHTML"]
|
|
|
|
)
|
|
|
|
){
|
|
|
|
|
|
|
|
$out["spelling"] = [
|
|
|
|
"type" => "not_many",
|
|
|
|
"using" => $search,
|
|
|
|
"correction" =>
|
|
|
|
$this->fuckhtml
|
|
|
|
->getTextContent(
|
|
|
|
$a[0]
|
|
|
|
)
|
|
|
|
];
|
|
|
|
}
|
|
|
|
|
|
|
|
elseif(
|
|
|
|
preg_match(
|
|
|
|
'/^showing results for/i',
|
|
|
|
$container["innerHTML"]
|
|
|
|
)
|
|
|
|
){
|
|
|
|
|
|
|
|
$out["spelling"] = [
|
|
|
|
"type" => "including",
|
|
|
|
"using" =>
|
|
|
|
$this->fuckhtml
|
|
|
|
->getTextContent(
|
|
|
|
$a[0]
|
|
|
|
),
|
|
|
|
"correction" => $search
|
|
|
|
];
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
2023-08-27 07:45:59 +02:00
|
|
|
$title =
|
|
|
|
$this->fuckhtml
|
|
|
|
->getElementsByClassName(
|
|
|
|
$this->findstyles(
|
|
|
|
[
|
|
|
|
"color" => "#1967d2",
|
|
|
|
"font-size" => "20px",
|
|
|
|
"line-height" => "26px"
|
|
|
|
],
|
|
|
|
self::is_class
|
|
|
|
),
|
|
|
|
"div"
|
|
|
|
);
|
|
|
|
|
|
|
|
if(count($title) !== 0){
|
|
|
|
|
2023-11-07 14:04:56 +01:00
|
|
|
//
|
|
|
|
// Container is a web link
|
|
|
|
//
|
2023-08-27 07:45:59 +02:00
|
|
|
$web = [
|
|
|
|
"title" =>
|
|
|
|
$this->titledots(
|
|
|
|
$this->fuckhtml
|
|
|
|
->getTextContent(
|
|
|
|
$title[0]
|
|
|
|
)
|
|
|
|
),
|
|
|
|
"description" => null,
|
2023-11-27 07:01:56 +01:00
|
|
|
"url" => null,
|
2023-08-27 07:45:59 +02:00
|
|
|
"date" => null,
|
|
|
|
"type" => "web",
|
|
|
|
"thumb" => [
|
|
|
|
"url" => null,
|
|
|
|
"ratio" => null
|
|
|
|
],
|
|
|
|
"sublink" => [],
|
|
|
|
"table" => []
|
|
|
|
];
|
|
|
|
|
2023-11-27 07:01:56 +01:00
|
|
|
// get link
|
|
|
|
$web["url"] =
|
|
|
|
$this->unshiturl(
|
|
|
|
$this->fuckhtml
|
|
|
|
->getElementsByTagName("a")
|
|
|
|
[0]
|
|
|
|
["attributes"]
|
|
|
|
["href"]
|
|
|
|
);
|
|
|
|
|
|
|
|
//
|
|
|
|
// check if link contains a carousel
|
|
|
|
//
|
|
|
|
$carousels = $this->parsecarousels();
|
|
|
|
if(count($carousels) !== 0){
|
|
|
|
|
|
|
|
$first = true;
|
|
|
|
foreach($carousels as $carousel_cat){
|
|
|
|
|
|
|
|
foreach($carousel_cat as $carousel){
|
|
|
|
|
|
|
|
if($first === true){
|
|
|
|
|
|
|
|
$first = false;
|
|
|
|
}elseif($carousel["image"] !== null){
|
|
|
|
|
|
|
|
$out["image"][] = [
|
|
|
|
"title" => $carousel["title"],
|
|
|
|
"source" => [
|
|
|
|
[
|
|
|
|
"url" => $carousel["image"],
|
|
|
|
"width" => null,
|
|
|
|
"height" => null
|
|
|
|
]
|
|
|
|
],
|
|
|
|
"url" => $carousel["url"]
|
|
|
|
];
|
|
|
|
}
|
|
|
|
|
|
|
|
$web["sublink"][] = [
|
|
|
|
"title" => $carousel["title"],
|
|
|
|
"date" => $carousel["date"],
|
|
|
|
"description" => $carousel["description"],
|
|
|
|
"url" => $carousel["url"]
|
|
|
|
];
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if($carousels[0][0]["image"] !== null){
|
|
|
|
$web["thumb"] = [
|
|
|
|
"url" => $carousels[0][0]["image"],
|
|
|
|
"ratio" => "16:9"
|
|
|
|
];
|
|
|
|
}
|
|
|
|
|
|
|
|
$out["web"][] = $web;
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
|
|
|
//
|
|
|
|
// no carousel entries, parse as normal link
|
|
|
|
//
|
|
|
|
$this->fuckhtml->load($container);
|
|
|
|
|
|
|
|
// parse URL
|
|
|
|
$web["url"] =
|
|
|
|
$this->unshiturl(
|
|
|
|
$this->fuckhtml
|
|
|
|
->getElementsByTagName("a")
|
|
|
|
[0]
|
|
|
|
["attributes"]
|
|
|
|
["href"]
|
|
|
|
);
|
|
|
|
|
2023-08-27 07:45:59 +02:00
|
|
|
$container = $container["innerHTML"];
|
|
|
|
|
2023-11-27 07:01:56 +01:00
|
|
|
$line_detect =
|
2023-08-27 07:45:59 +02:00
|
|
|
$this->fuckhtml
|
|
|
|
->getElementsByClassName(
|
|
|
|
$this->findstyles(
|
|
|
|
[
|
2023-11-27 07:01:56 +01:00
|
|
|
"height" => "1px",
|
|
|
|
"background-color" => "#dadce0",
|
|
|
|
"margin" => "0 16px"
|
2023-08-27 07:45:59 +02:00
|
|
|
],
|
|
|
|
self::is_class
|
|
|
|
),
|
|
|
|
"div"
|
2023-11-27 07:01:56 +01:00
|
|
|
);
|
|
|
|
|
|
|
|
if(count($line_detect) !== 0){
|
|
|
|
|
|
|
|
// we found a line, this means we're dealing with a
|
|
|
|
// "featured snippet"
|
|
|
|
$featured = true;
|
|
|
|
|
|
|
|
$description_container =
|
|
|
|
$this->fuckhtml
|
|
|
|
->getElementsByClassName(
|
|
|
|
$this->findstyles(
|
|
|
|
[
|
|
|
|
"white-space" => "pre-line",
|
|
|
|
"word-wrap" => "break-word"
|
|
|
|
],
|
|
|
|
self::is_class
|
|
|
|
),
|
|
|
|
"div"
|
|
|
|
)[1];
|
|
|
|
|
|
|
|
// get date node for it
|
|
|
|
$date =
|
|
|
|
$this->fuckhtml
|
|
|
|
->getElementsByTagName("sub");
|
|
|
|
|
|
|
|
if(count($date) !== 0){
|
|
|
|
$web["date"] =
|
|
|
|
strtotime(
|
|
|
|
$this->fuckhtml
|
|
|
|
->getTextContent(
|
|
|
|
$date[0]
|
|
|
|
)
|
|
|
|
);
|
|
|
|
}
|
|
|
|
}else{
|
|
|
|
|
|
|
|
// we're dealing with a normal link
|
|
|
|
$featured = false;
|
|
|
|
|
|
|
|
$description_container =
|
|
|
|
$this->fuckhtml
|
|
|
|
->getElementsByClassName(
|
|
|
|
$this->findstyles(
|
|
|
|
[
|
|
|
|
"padding" => "12px 16px 12px"
|
|
|
|
],
|
|
|
|
self::is_class
|
|
|
|
),
|
|
|
|
"div"
|
|
|
|
)[1];
|
|
|
|
}
|
|
|
|
|
|
|
|
//
|
|
|
|
// Get author if we're parsing news
|
|
|
|
//
|
|
|
|
if($pagetype == "news"){
|
|
|
|
|
|
|
|
$author =
|
|
|
|
$this->fuckhtml
|
|
|
|
->getElementsByClassName(
|
|
|
|
$this->findstyles(
|
|
|
|
[
|
|
|
|
"position" => "absolute",
|
|
|
|
"width" => "100%",
|
|
|
|
"top" => "0",
|
|
|
|
"left" => "0",
|
|
|
|
"padding-top" => "1px",
|
|
|
|
"margin-bottom" => "-1px"
|
|
|
|
],
|
|
|
|
self::is_class
|
|
|
|
),
|
|
|
|
"div"
|
|
|
|
);
|
|
|
|
|
|
|
|
if(count($author) !== 0){
|
|
|
|
|
|
|
|
$web["author"] =
|
|
|
|
$this->fuckhtml
|
|
|
|
->getTextContent(
|
|
|
|
$author[0]
|
|
|
|
);
|
|
|
|
}else{
|
|
|
|
|
|
|
|
$web["author"] = null;
|
|
|
|
}
|
|
|
|
}
|
2023-08-27 07:45:59 +02:00
|
|
|
|
|
|
|
$description =
|
|
|
|
$description_container["innerHTML"];
|
|
|
|
|
|
|
|
$this->fuckhtml->load($description);
|
|
|
|
|
2023-11-27 07:01:56 +01:00
|
|
|
//
|
|
|
|
// get thumbnail before we call loadhtml again
|
|
|
|
//
|
|
|
|
$img =
|
|
|
|
$this->fuckhtml
|
|
|
|
->getElementsByTagName("img");
|
|
|
|
|
|
|
|
if(count($img) !== 0){
|
|
|
|
|
|
|
|
$skip = true;
|
|
|
|
|
|
|
|
if(
|
|
|
|
isset($img[0]["attributes"]["alt"]) &&
|
|
|
|
stripos($img[0]["attributes"]["alt"], "Video for") !== false
|
|
|
|
){
|
|
|
|
|
|
|
|
// is a video thumbnail
|
|
|
|
$web["thumb"]["ratio"] = "16:9";
|
|
|
|
}else{
|
|
|
|
|
|
|
|
// is a google thumbnail
|
|
|
|
$web["thumb"]["ratio"] = "1:1";
|
|
|
|
}
|
|
|
|
|
|
|
|
$web["thumb"]["url"] =
|
|
|
|
$this->getimage(
|
|
|
|
$img[0]["attributes"]["id"]
|
|
|
|
);
|
|
|
|
}else{
|
|
|
|
|
|
|
|
$skip = false;
|
|
|
|
}
|
|
|
|
|
|
|
|
//
|
|
|
|
// get sublinks
|
|
|
|
//
|
2023-08-27 07:45:59 +02:00
|
|
|
$links =
|
|
|
|
$this->fuckhtml
|
|
|
|
->getElementsByTagName("a");
|
|
|
|
|
|
|
|
foreach($links as $link){
|
|
|
|
|
2023-11-27 07:01:56 +01:00
|
|
|
if($skip === true){
|
|
|
|
|
|
|
|
$skip = false;
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
2023-08-27 07:45:59 +02:00
|
|
|
$description =
|
|
|
|
str_replace(
|
|
|
|
$link["outerHTML"],
|
|
|
|
"",
|
|
|
|
$description
|
|
|
|
);
|
|
|
|
|
|
|
|
$sublink = [
|
|
|
|
"title" => null,
|
|
|
|
"description" => null,
|
|
|
|
"url" => null,
|
|
|
|
"date" => null
|
|
|
|
];
|
|
|
|
|
|
|
|
$sublink["title"] =
|
2023-11-27 07:01:56 +01:00
|
|
|
$this->titledots(
|
|
|
|
$this->fuckhtml
|
|
|
|
->getTextContent(
|
|
|
|
$link
|
|
|
|
)
|
2023-08-27 07:45:59 +02:00
|
|
|
);
|
|
|
|
|
|
|
|
$sublink["url"] =
|
2023-11-27 07:01:56 +01:00
|
|
|
$this->unshiturl(
|
2023-08-27 07:45:59 +02:00
|
|
|
$link
|
|
|
|
["attributes"]
|
|
|
|
["href"]
|
|
|
|
);
|
|
|
|
|
2023-11-27 07:01:56 +01:00
|
|
|
if(parse_url($sublink["url"], PHP_URL_HOST) !== null){
|
|
|
|
|
|
|
|
$web["sublink"][] = $sublink;
|
|
|
|
}
|
2023-08-27 07:45:59 +02:00
|
|
|
}
|
|
|
|
|
2023-11-27 07:01:56 +01:00
|
|
|
//
|
|
|
|
// Parse spans in description
|
|
|
|
//
|
|
|
|
$this->fuckhtml->load($description);
|
|
|
|
|
|
|
|
if($featured === false){
|
|
|
|
|
|
|
|
$levels =
|
|
|
|
$this->fuckhtml
|
|
|
|
->getElementsByClassName(
|
|
|
|
$this->findstyles(
|
|
|
|
[
|
|
|
|
"padding-bottom" => "8px"
|
|
|
|
],
|
|
|
|
self::is_class
|
|
|
|
),
|
|
|
|
"div"
|
|
|
|
);
|
|
|
|
|
|
|
|
// oh my god yes, fucking great, sometimes there are NO levels
|
|
|
|
// hahahahahhahahahahahahahahahhahaa
|
|
|
|
if(count($levels) === 0){
|
|
|
|
|
|
|
|
$levels = [$description];
|
|
|
|
}
|
|
|
|
|
|
|
|
foreach($levels as $level){
|
|
|
|
|
|
|
|
$this->fuckhtml->load($level);
|
|
|
|
|
|
|
|
$spans =
|
|
|
|
$this->fuckhtml
|
|
|
|
->getElementsByTagName(
|
|
|
|
"span"
|
|
|
|
);
|
|
|
|
|
|
|
|
$is_rating = -1;
|
|
|
|
|
|
|
|
foreach($spans as $span){
|
|
|
|
|
|
|
|
$innertext =
|
|
|
|
trim(
|
|
|
|
$this->fuckhtml
|
|
|
|
->getTextContent(
|
|
|
|
$span
|
|
|
|
),
|
|
|
|
" ·."
|
|
|
|
);
|
|
|
|
|
|
|
|
if($innertext == ""){ continue; }
|
|
|
|
|
|
|
|
if(
|
|
|
|
strtolower($innertext)
|
|
|
|
== "rating"
|
|
|
|
){
|
|
|
|
|
|
|
|
$is_rating = 0;
|
|
|
|
|
|
|
|
// clean up before we go
|
|
|
|
$description =
|
|
|
|
str_replace(
|
|
|
|
$span["outerHTML"],
|
|
|
|
"",
|
|
|
|
$description
|
|
|
|
);
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
|
|
|
//
|
|
|
|
// Parse rating object
|
|
|
|
//
|
|
|
|
if($is_rating >= 0){
|
|
|
|
|
|
|
|
// clean up description
|
|
|
|
$description =
|
|
|
|
str_replace(
|
|
|
|
$span["outerHTML"],
|
|
|
|
"",
|
|
|
|
$description
|
|
|
|
);
|
|
|
|
|
|
|
|
if($span["level"] !== 1){ continue; }
|
|
|
|
$is_rating++;
|
|
|
|
|
|
|
|
// 10/10 (123)
|
|
|
|
if($is_rating === 1){
|
|
|
|
|
|
|
|
$innertext = explode(" ", $innertext, 2);
|
|
|
|
|
|
|
|
$web["table"]["Rating"] = $innertext[0];
|
|
|
|
|
|
|
|
if(count($innertext) === 2){
|
|
|
|
$web["table"]["Hits"] =
|
|
|
|
trim(
|
|
|
|
str_replace(
|
|
|
|
[
|
|
|
|
"(",
|
|
|
|
")"
|
|
|
|
],
|
|
|
|
"",
|
|
|
|
$innertext[1]
|
|
|
|
)
|
|
|
|
);
|
|
|
|
|
|
|
|
if($web["table"]["Hits"] == ""){
|
|
|
|
|
|
|
|
unset($web["table"]["Hits"]);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
|
|
|
// US$4.99
|
|
|
|
// MYR 50.00
|
|
|
|
// $38.34
|
|
|
|
// JP¥6,480
|
|
|
|
// Reviewed by your mom
|
|
|
|
if($is_rating === 2){
|
|
|
|
|
|
|
|
if(
|
|
|
|
preg_match(
|
|
|
|
'/^Review by (.+)/',
|
|
|
|
$innertext,
|
|
|
|
$match
|
|
|
|
)
|
|
|
|
){
|
|
|
|
|
|
|
|
$web["table"]["Author"] = $match[1];
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
|
|
|
$web["table"]["Price"] = $innertext;
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
|
|
|
// Android / In stock
|
|
|
|
if($is_rating === 3){
|
|
|
|
|
|
|
|
$web["table"]["Support"] = $innertext;
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
|
|
|
// ignore the rest
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
|
|
|
//
|
|
|
|
// Parse standalone text
|
|
|
|
//
|
|
|
|
|
|
|
|
// If we reach this point:
|
|
|
|
// 1. Ratings have been parsed
|
|
|
|
// 2. We're parsing a WEB link, not some shitty piece of shit
|
|
|
|
|
|
|
|
// check for date
|
|
|
|
// if span has no text before it, assume it's a date
|
|
|
|
$desc_split =
|
|
|
|
explode(
|
|
|
|
$span["outerHTML"],
|
|
|
|
$description,
|
|
|
|
2
|
|
|
|
);
|
|
|
|
|
|
|
|
if(
|
|
|
|
$this->fuckhtml
|
|
|
|
->getTextContent(
|
|
|
|
$desc_split[0]
|
|
|
|
) == ""
|
|
|
|
){
|
|
|
|
|
|
|
|
// has no text before
|
|
|
|
$date = strtotime($innertext);
|
|
|
|
if($date){
|
|
|
|
|
|
|
|
$web["date"] = $date;
|
|
|
|
}
|
|
|
|
|
|
|
|
// cleanup
|
|
|
|
$description =
|
|
|
|
str_replace(
|
|
|
|
$span["outerHTML"],
|
|
|
|
"",
|
|
|
|
$description
|
|
|
|
);
|
|
|
|
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
|
|
|
// Ready to parse table
|
|
|
|
if(count($desc_split) === 2){
|
|
|
|
$this->fuckhtml->load($desc_split[1]);
|
|
|
|
|
|
|
|
$web["table"][
|
|
|
|
$this->fuckhtml
|
|
|
|
->getTextContent(
|
|
|
|
trim($desc_split[0], ": ")
|
|
|
|
)
|
|
|
|
] = $innertext;
|
|
|
|
|
|
|
|
// cleanup
|
|
|
|
$description =
|
|
|
|
str_replace(
|
|
|
|
$desc_split[0] . $span["outerHTML"],
|
|
|
|
"",
|
|
|
|
$description
|
|
|
|
);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
$web["description"] =
|
|
|
|
trim(
|
|
|
|
$this->fuckhtml
|
|
|
|
->getTextContent(
|
|
|
|
$description
|
|
|
|
),
|
|
|
|
" ·."
|
|
|
|
);
|
|
|
|
|
|
|
|
if($web["description"] == ""){
|
|
|
|
|
|
|
|
$web["description"] = null;
|
|
|
|
}
|
|
|
|
|
|
|
|
$out["web"][] = $web;
|
|
|
|
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
|
|
|
//
|
|
|
|
// Detect wikipedia shit
|
|
|
|
//
|
|
|
|
$wiki_title =
|
|
|
|
$this->fuckhtml
|
|
|
|
->getElementsByTagName("h3");
|
|
|
|
|
|
|
|
if(count($wiki_title) !== 0){
|
|
|
|
|
|
|
|
$description_after = [];
|
|
|
|
$description = [];
|
|
|
|
$table = [];
|
|
|
|
$sublink = [];
|
|
|
|
|
|
|
|
$as =
|
2023-08-27 07:45:59 +02:00
|
|
|
$this->fuckhtml
|
2023-11-27 07:01:56 +01:00
|
|
|
->getElementsByTagName("a");
|
|
|
|
|
|
|
|
foreach($as as $a){
|
|
|
|
|
|
|
|
if(
|
|
|
|
isset($a["attributes"]["href"]) &&
|
|
|
|
parse_url($a["attributes"]["href"], PHP_URL_HOST) == "maps.google.com"
|
|
|
|
){
|
|
|
|
|
|
|
|
// detected maps embed, ignore
|
|
|
|
continue 2;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// get carousels and remove them from container for image grepper
|
|
|
|
$carousels = $this->parsecarousels($container["innerHTML"]);
|
|
|
|
$this->fuckhtml->load($container);
|
|
|
|
|
|
|
|
// add images to image tab, if applicable
|
|
|
|
for($i=0; $i<count($carousels); $i++){
|
|
|
|
|
|
|
|
foreach($carousels[$i] as $item){
|
|
|
|
|
|
|
|
if(
|
|
|
|
$item["url"] !== null &&
|
|
|
|
$item["ref"] !== null &&
|
|
|
|
$item["image"] !== null &&
|
|
|
|
$item["title"] !== null
|
|
|
|
){
|
|
|
|
|
|
|
|
$out["image"][] = [
|
|
|
|
"title" => $item["title"],
|
|
|
|
"source" => [
|
|
|
|
[
|
|
|
|
"url" => $item["url"],
|
|
|
|
"width" => $item["image_width"],
|
|
|
|
"height" => $item["image_height"]
|
|
|
|
],
|
|
|
|
[
|
|
|
|
"url" => $item["image"],
|
|
|
|
"width" => $item["thumb_width"],
|
|
|
|
"height" => $item["thumb_height"]
|
|
|
|
]
|
|
|
|
],
|
|
|
|
"url" => $item["ref"]
|
|
|
|
];
|
|
|
|
|
|
|
|
unset($carousels[$i]);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
$carousels = array_values($carousels);
|
|
|
|
|
|
|
|
// interpret remaining carousels as title + carousel
|
|
|
|
$titles =
|
|
|
|
$this->fuckhtml
|
|
|
|
->getElementsByClassName(
|
|
|
|
$this->findstyles(
|
|
|
|
[
|
|
|
|
"font-weight" => "700",
|
|
|
|
"letter-spacing" => "0.75px",
|
|
|
|
"text-transform" => "uppercase"
|
|
|
|
],
|
|
|
|
self::is_class
|
|
|
|
)
|
|
|
|
);
|
2023-08-27 07:45:59 +02:00
|
|
|
|
2023-11-27 07:01:56 +01:00
|
|
|
for($i=0; $i<count($titles); $i++){
|
2023-08-27 07:45:59 +02:00
|
|
|
|
2023-11-27 07:01:56 +01:00
|
|
|
if(!isset($carousels[$i])){
|
2023-08-27 07:45:59 +02:00
|
|
|
|
2023-11-27 07:01:56 +01:00
|
|
|
break;
|
2023-08-27 07:45:59 +02:00
|
|
|
}
|
|
|
|
|
2023-11-27 07:01:56 +01:00
|
|
|
$description_after[] = [
|
|
|
|
"type" => "title",
|
|
|
|
"value" =>
|
|
|
|
$this->fuckhtml
|
|
|
|
->getTextContent(
|
|
|
|
$titles[$i]
|
|
|
|
)
|
|
|
|
];
|
|
|
|
|
|
|
|
foreach($carousels[$i] as $carousel){
|
|
|
|
|
|
|
|
$description_after[] = [
|
|
|
|
"type" => "link",
|
|
|
|
"url" => "web?s=" . urlencode($carousel["description"]) . "&scraper=google",
|
|
|
|
"value" => $carousel["description"]
|
|
|
|
];
|
|
|
|
|
|
|
|
if($carousel["subtext"] !== null){
|
|
|
|
|
|
|
|
$description_after[] = [
|
|
|
|
"type" => "quote",
|
|
|
|
"value" => $carousel["subtext"]
|
|
|
|
];
|
|
|
|
}
|
|
|
|
|
|
|
|
$description_after[] = [
|
|
|
|
"type" => "image",
|
|
|
|
"url" => $carousel["image"]
|
|
|
|
];
|
|
|
|
}
|
2023-08-27 07:45:59 +02:00
|
|
|
}
|
|
|
|
|
2023-11-27 07:01:56 +01:00
|
|
|
$categories =
|
2023-08-27 07:45:59 +02:00
|
|
|
$this->fuckhtml
|
|
|
|
->getElementsByClassName(
|
|
|
|
$this->findstyles(
|
|
|
|
[
|
2023-11-27 07:01:56 +01:00
|
|
|
"padding" => "12px 16px 12px"
|
2023-08-27 07:45:59 +02:00
|
|
|
],
|
|
|
|
self::is_class
|
2023-11-27 07:01:56 +01:00
|
|
|
)
|
2023-08-27 07:45:59 +02:00
|
|
|
);
|
|
|
|
|
2023-11-27 07:01:56 +01:00
|
|
|
$image =
|
|
|
|
$this->fuckhtml
|
|
|
|
->getElementsByTagName("img");
|
|
|
|
|
|
|
|
if(count($image) !== 0){
|
2023-08-27 07:45:59 +02:00
|
|
|
|
2023-11-27 07:01:56 +01:00
|
|
|
$image = $this->getimage($image[0]["attributes"]["id"]);
|
|
|
|
}else{
|
2023-08-27 07:45:59 +02:00
|
|
|
|
2023-11-27 07:01:56 +01:00
|
|
|
$image = null;
|
|
|
|
}
|
|
|
|
|
|
|
|
$url = null;
|
|
|
|
|
|
|
|
for($i=0; $i<count($categories); $i++){
|
2023-08-27 07:45:59 +02:00
|
|
|
|
2023-11-27 07:01:56 +01:00
|
|
|
$this->fuckhtml->load($categories[$i]);
|
2023-08-27 07:45:59 +02:00
|
|
|
|
2023-11-27 07:01:56 +01:00
|
|
|
if($i === 0){
|
|
|
|
// first node. this should be the header with the small
|
|
|
|
// information snippet
|
|
|
|
|
|
|
|
$url =
|
|
|
|
$this->fuckhtml
|
|
|
|
->getElementsByTagName("a");
|
|
|
|
|
|
|
|
if(count($url) !== 0){
|
|
|
|
|
|
|
|
$url =
|
|
|
|
$this->unshiturl(
|
|
|
|
$url[0]["attributes"]["href"]
|
|
|
|
);
|
|
|
|
|
|
|
|
if(parse_url($url, PHP_URL_HOST) == "encrypted-tbn0.gstatic.com"){
|
|
|
|
|
|
|
|
$image = $url;
|
|
|
|
$url = null;
|
|
|
|
}
|
|
|
|
}else{
|
|
|
|
|
|
|
|
$url = null;
|
|
|
|
}
|
2023-08-27 07:45:59 +02:00
|
|
|
|
2023-11-27 07:01:56 +01:00
|
|
|
$categories[$i]["innerHTML"] =
|
2023-08-27 07:45:59 +02:00
|
|
|
str_replace(
|
2023-11-27 07:01:56 +01:00
|
|
|
$wiki_title[0]["outerHTML"],
|
2023-08-27 07:45:59 +02:00
|
|
|
"",
|
2023-11-27 07:01:56 +01:00
|
|
|
$categories[$i]["innerHTML"]
|
2023-08-27 07:45:59 +02:00
|
|
|
);
|
|
|
|
|
2023-11-27 07:01:56 +01:00
|
|
|
$subtext =
|
2023-08-27 07:45:59 +02:00
|
|
|
$this->fuckhtml
|
|
|
|
->getTextContent(
|
2023-11-27 07:01:56 +01:00
|
|
|
$categories[$i]["innerHTML"]
|
2023-08-27 07:45:59 +02:00
|
|
|
);
|
|
|
|
|
2023-11-27 07:01:56 +01:00
|
|
|
if(strlen($subtext) !== 0){
|
|
|
|
|
|
|
|
$description[] = [
|
|
|
|
"type" => "quote",
|
|
|
|
"value" =>
|
|
|
|
$this->fuckhtml
|
|
|
|
->getTextContent(
|
|
|
|
$categories[$i]["innerHTML"]
|
|
|
|
)
|
|
|
|
];
|
|
|
|
}
|
2023-08-27 07:45:59 +02:00
|
|
|
|
2023-11-27 07:01:56 +01:00
|
|
|
// detect audio file
|
|
|
|
$audio =
|
|
|
|
$this->fuckhtml
|
|
|
|
->getElementsByTagName("audio");
|
|
|
|
|
|
|
|
if(count($audio) !== 0){
|
2023-08-27 07:45:59 +02:00
|
|
|
|
2023-11-27 07:01:56 +01:00
|
|
|
$description[] = [
|
|
|
|
"type" => "audio",
|
|
|
|
"url" =>
|
|
|
|
$this->fuckhtml
|
|
|
|
->getTextContent(
|
|
|
|
$audio[0]["attributes"]["src"]
|
|
|
|
)
|
|
|
|
];
|
2023-08-27 07:45:59 +02:00
|
|
|
}
|
2023-11-27 07:01:56 +01:00
|
|
|
}else{
|
2023-08-27 07:45:59 +02:00
|
|
|
|
2023-11-27 07:01:56 +01:00
|
|
|
// check for separator elements IN THERE
|
|
|
|
$separators =
|
|
|
|
$this->fuckhtml
|
|
|
|
->getElementsByClassName(
|
|
|
|
$this->findstyles(
|
|
|
|
[
|
|
|
|
"white-space" => "pre-line",
|
|
|
|
"word-wrap" => "break-word"
|
|
|
|
],
|
|
|
|
self::is_class
|
|
|
|
),
|
|
|
|
"div"
|
|
|
|
);
|
2023-08-27 07:45:59 +02:00
|
|
|
|
2023-11-27 07:01:56 +01:00
|
|
|
// detect container type
|
|
|
|
foreach($separators as $separator){
|
2023-08-27 07:45:59 +02:00
|
|
|
|
2023-11-27 07:01:56 +01:00
|
|
|
$this->fuckhtml->load($separator);
|
2023-08-27 07:45:59 +02:00
|
|
|
|
2023-11-27 07:01:56 +01:00
|
|
|
// ignore wrong levels
|
|
|
|
if($separator["level"] !== 2){
|
|
|
|
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
|
|
|
//
|
|
|
|
// Detect word definition
|
|
|
|
//
|
|
|
|
$wordwraps =
|
|
|
|
$this->fuckhtml
|
|
|
|
->getElementsByClassName(
|
|
|
|
$this->findstyles(
|
|
|
|
[
|
|
|
|
"padding-bottom" => "12px"
|
|
|
|
],
|
|
|
|
self::is_class
|
|
|
|
),
|
|
|
|
"div"
|
|
|
|
);
|
2023-08-27 07:45:59 +02:00
|
|
|
|
2023-11-27 07:01:56 +01:00
|
|
|
if(count($wordwraps) !== 0){
|
|
|
|
|
|
|
|
foreach($wordwraps as $word){
|
|
|
|
|
|
|
|
$this->fuckhtml->load($word);
|
|
|
|
|
|
|
|
// detect title
|
|
|
|
$span =
|
|
|
|
$this->fuckhtml
|
|
|
|
->getElementsByTagName(
|
|
|
|
"span"
|
|
|
|
);
|
|
|
|
|
|
|
|
if(
|
|
|
|
count($span) === 1 &&
|
|
|
|
$this->fuckhtml
|
|
|
|
->getTextContent(
|
|
|
|
str_replace(
|
|
|
|
$span[0]["outerHTML"],
|
|
|
|
"",
|
|
|
|
$word["innerHTML"]
|
|
|
|
)
|
|
|
|
) == ""
|
|
|
|
){
|
|
|
|
|
|
|
|
$description[] = [
|
|
|
|
"type" => "title",
|
|
|
|
"value" =>
|
|
|
|
$this->fuckhtml
|
|
|
|
->getTextContent(
|
|
|
|
$span[0]
|
|
|
|
)
|
|
|
|
];
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
|
|
|
// detect list element
|
|
|
|
$lists =
|
|
|
|
$this->fuckhtml
|
|
|
|
->getElementsByTagName("ol");
|
|
|
|
|
|
|
|
if(count($lists) !== 0){
|
|
|
|
foreach($lists as $list){
|
|
|
|
|
|
|
|
$this->fuckhtml->load($list);
|
|
|
|
|
|
|
|
$items =
|
|
|
|
$this->fuckhtml
|
|
|
|
->getElementsByTagName("li");
|
|
|
|
|
|
|
|
$w = 0;
|
|
|
|
foreach($items as $item){
|
|
|
|
|
|
|
|
$w++;
|
|
|
|
$this->fuckhtml->load($item);
|
|
|
|
|
|
|
|
// get subnodes
|
|
|
|
$subnodes =
|
|
|
|
$this->fuckhtml
|
|
|
|
->getElementsByClassName(
|
|
|
|
$this->findstyles(
|
|
|
|
[
|
|
|
|
"white-space" => "pre-line",
|
|
|
|
"word-wrap" => "break-word"
|
|
|
|
],
|
|
|
|
self::is_class
|
|
|
|
),
|
|
|
|
"div"
|
|
|
|
);
|
|
|
|
|
|
|
|
foreach($subnodes as $subnode){
|
|
|
|
|
|
|
|
$this->fuckhtml->load($subnode);
|
|
|
|
|
|
|
|
$spans =
|
|
|
|
$this->fuckhtml
|
|
|
|
->getElementsByTagName("span");
|
|
|
|
|
|
|
|
if(count($spans) !== 0){
|
|
|
|
|
|
|
|
// append quote
|
|
|
|
$description[] = [
|
|
|
|
"type" => "quote",
|
|
|
|
"value" =>
|
|
|
|
$this->fuckhtml
|
|
|
|
->getTextContent(
|
|
|
|
$subnode
|
|
|
|
)
|
|
|
|
];
|
|
|
|
}else{
|
|
|
|
|
|
|
|
// append text
|
|
|
|
$description[] = [
|
|
|
|
"type" => "text",
|
|
|
|
"value" =>
|
|
|
|
$w . ". " .
|
|
|
|
$this->fuckhtml
|
|
|
|
->getTextContent(
|
|
|
|
$subnode
|
|
|
|
)
|
|
|
|
];
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}else{
|
|
|
|
|
|
|
|
// parse without list
|
|
|
|
// get subnodes
|
|
|
|
$subnodes =
|
|
|
|
$this->fuckhtml
|
|
|
|
->getElementsByClassName(
|
|
|
|
$this->findstyles(
|
|
|
|
[
|
|
|
|
"white-space" => "pre-line",
|
|
|
|
"word-wrap" => "break-word"
|
|
|
|
],
|
|
|
|
self::is_class
|
|
|
|
),
|
|
|
|
"div"
|
|
|
|
);
|
|
|
|
|
|
|
|
foreach($subnodes as $subnode){
|
|
|
|
|
|
|
|
$this->fuckhtml->load($subnode);
|
|
|
|
|
|
|
|
$spans =
|
|
|
|
$this->fuckhtml
|
|
|
|
->getElementsByTagName("span");
|
|
|
|
|
|
|
|
if(count($spans) !== 0){
|
|
|
|
|
|
|
|
// append quote
|
|
|
|
$description[] = [
|
|
|
|
"type" => "quote",
|
|
|
|
"value" =>
|
|
|
|
$this->fuckhtml
|
|
|
|
->getTextContent(
|
|
|
|
$subnode
|
|
|
|
)
|
|
|
|
];
|
|
|
|
}else{
|
|
|
|
|
|
|
|
// append text
|
|
|
|
$description[] = [
|
|
|
|
"type" => "text",
|
|
|
|
"value" =>
|
|
|
|
$this->fuckhtml
|
|
|
|
->getTextContent(
|
|
|
|
$subnode
|
|
|
|
)
|
|
|
|
];
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}else{
|
|
|
|
|
|
|
|
//
|
|
|
|
// Parse table
|
|
|
|
//
|
|
|
|
$spans =
|
|
|
|
$this->fuckhtml
|
|
|
|
->getElementsByTagName("span");
|
2023-08-27 07:45:59 +02:00
|
|
|
|
2023-11-27 07:01:56 +01:00
|
|
|
foreach($spans as $span){
|
|
|
|
|
|
|
|
if(!isset($span["attributes"]["class"])){
|
|
|
|
|
|
|
|
// found table
|
|
|
|
$row =
|
|
|
|
explode(
|
|
|
|
":",
|
|
|
|
$this->fuckhtml
|
|
|
|
->getTextContent(
|
|
|
|
$separator
|
|
|
|
),
|
|
|
|
2
|
|
|
|
);
|
|
|
|
|
|
|
|
if(count($row) === 2){
|
|
|
|
|
|
|
|
$table[rtrim($row[0])] =
|
|
|
|
ltrim($row[1]);
|
|
|
|
|
|
|
|
}
|
|
|
|
continue 2;
|
|
|
|
}
|
|
|
|
}
|
2023-08-27 07:45:59 +02:00
|
|
|
|
2023-11-27 07:01:56 +01:00
|
|
|
//
|
|
|
|
// Parse normal description
|
|
|
|
//
|
|
|
|
$links_rem =
|
|
|
|
$this->fuckhtml
|
|
|
|
->getElementsByTagName("a");
|
|
|
|
|
|
|
|
foreach($links_rem as $rem){
|
|
|
|
|
|
|
|
$separator["innerHTML"] =
|
2023-08-27 07:45:59 +02:00
|
|
|
str_replace(
|
2023-11-27 07:01:56 +01:00
|
|
|
$rem["outerHTML"],
|
2023-08-27 07:45:59 +02:00
|
|
|
"",
|
2023-11-27 07:01:56 +01:00
|
|
|
$separator["innerHTML"]
|
|
|
|
);
|
|
|
|
}
|
|
|
|
|
|
|
|
$description[] = [
|
|
|
|
"type" => "text",
|
|
|
|
"value" =>
|
|
|
|
rtrim(
|
|
|
|
$this->fuckhtml
|
|
|
|
->getTextContent(
|
|
|
|
$separator
|
|
|
|
),
|
|
|
|
" .,"
|
2023-08-27 07:45:59 +02:00
|
|
|
)
|
2023-11-27 07:01:56 +01:00
|
|
|
];
|
2023-08-27 07:45:59 +02:00
|
|
|
}
|
2023-11-27 07:01:56 +01:00
|
|
|
}
|
|
|
|
}
|
2023-08-27 07:45:59 +02:00
|
|
|
|
2023-11-27 07:01:56 +01:00
|
|
|
// detect huge buttons
|
|
|
|
$buttons =
|
|
|
|
$this->fuckhtml
|
|
|
|
->getElementsByClassName(
|
|
|
|
$this->findstyles(
|
|
|
|
[
|
|
|
|
"display" => "table-cell",
|
|
|
|
"vertical-align" => "middle",
|
|
|
|
"height" => "52px",
|
|
|
|
"text-align" => "center"
|
|
|
|
],
|
|
|
|
self::is_class
|
|
|
|
),
|
|
|
|
"a"
|
|
|
|
);
|
|
|
|
|
|
|
|
if(count($buttons) !== 0){
|
2023-08-27 07:45:59 +02:00
|
|
|
|
2023-11-27 07:01:56 +01:00
|
|
|
foreach($buttons as $button){
|
2023-08-27 07:45:59 +02:00
|
|
|
|
2023-11-27 07:01:56 +01:00
|
|
|
if(isset($button["attributes"]["href"])){
|
2023-08-27 07:45:59 +02:00
|
|
|
|
2023-11-27 07:01:56 +01:00
|
|
|
$sublink[
|
|
|
|
$this->fuckhtml
|
|
|
|
->getTextContent(
|
|
|
|
$button
|
|
|
|
)
|
|
|
|
] =
|
|
|
|
$this->unshiturl(
|
|
|
|
$button["attributes"]["href"]
|
|
|
|
);
|
2023-08-27 07:45:59 +02:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2023-11-27 07:01:56 +01:00
|
|
|
// append description_after (contains carousel info)
|
|
|
|
$description = array_merge(
|
|
|
|
$description,
|
|
|
|
$description_after
|
|
|
|
);
|
2023-08-27 07:45:59 +02:00
|
|
|
|
2023-11-27 07:01:56 +01:00
|
|
|
$out["answer"][] = [
|
|
|
|
"title" =>
|
2023-08-27 07:45:59 +02:00
|
|
|
$this->fuckhtml
|
|
|
|
->getTextContent(
|
2023-11-27 07:01:56 +01:00
|
|
|
$wiki_title[0]
|
2023-08-27 07:45:59 +02:00
|
|
|
),
|
2023-11-27 07:01:56 +01:00
|
|
|
"description" => $description,
|
|
|
|
"url" => $url,
|
|
|
|
"thumb" => $image,
|
|
|
|
"table" => $table,
|
|
|
|
"sublink" => $sublink
|
|
|
|
];
|
2023-08-27 07:45:59 +02:00
|
|
|
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
2023-11-27 07:01:56 +01:00
|
|
|
//
|
|
|
|
// Detect related searches containers
|
|
|
|
//
|
2023-08-27 07:45:59 +02:00
|
|
|
$container_title =
|
|
|
|
$this->fuckhtml
|
|
|
|
->getElementsByClassName(
|
|
|
|
$this->findstyles(
|
|
|
|
[
|
|
|
|
"color" => "#000",
|
2024-03-15 03:33:01 +01:00
|
|
|
"font-size" => "16px",
|
|
|
|
"font-weight" => "bold",
|
2023-08-27 07:45:59 +02:00
|
|
|
"margin" => "0",
|
2024-03-15 03:33:01 +01:00
|
|
|
"padding" => "12px 16px 0px 16px"
|
2023-08-27 07:45:59 +02:00
|
|
|
],
|
|
|
|
self::is_class
|
|
|
|
),
|
|
|
|
"div"
|
|
|
|
);
|
|
|
|
|
|
|
|
if(count($container_title) !== 0){
|
|
|
|
|
2023-11-27 07:01:56 +01:00
|
|
|
// get carousel entries
|
|
|
|
$carousels = $this->parsecarousels($container["innerHTML"]);
|
|
|
|
$this->fuckhtml->load($container);
|
|
|
|
|
|
|
|
foreach($carousels as $carousel){
|
|
|
|
|
|
|
|
foreach($carousel as $item){
|
|
|
|
|
|
|
|
if($item["url"] !== null){
|
|
|
|
|
|
|
|
$out["related"][] = $item["url"];
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2023-08-27 07:45:59 +02:00
|
|
|
$container_title =
|
|
|
|
strtolower(
|
|
|
|
$this->fuckhtml
|
|
|
|
->getTextContent(
|
|
|
|
$container_title[0]
|
|
|
|
)
|
|
|
|
);
|
|
|
|
|
2023-11-27 07:01:56 +01:00
|
|
|
switch($container_title){
|
2023-08-27 07:45:59 +02:00
|
|
|
|
2023-11-27 07:01:56 +01:00
|
|
|
case "related searches":
|
|
|
|
case "people also search for":
|
|
|
|
//
|
|
|
|
// Parse related searches
|
|
|
|
//
|
|
|
|
$as =
|
|
|
|
$this->fuckhtml
|
|
|
|
->getElementsByTagName("a");
|
2023-08-27 07:45:59 +02:00
|
|
|
|
2023-11-27 07:01:56 +01:00
|
|
|
foreach($as as $a){
|
|
|
|
|
|
|
|
$out["related"][] =
|
|
|
|
$this->fuckhtml
|
|
|
|
->getTextContent($a);
|
|
|
|
}
|
|
|
|
break;
|
|
|
|
|
|
|
|
case "people also ask":
|
|
|
|
// get related queries
|
|
|
|
$divs =
|
2023-08-27 07:45:59 +02:00
|
|
|
$this->fuckhtml
|
2023-11-27 07:01:56 +01:00
|
|
|
->getElementsByTagName("div");
|
|
|
|
|
|
|
|
foreach($divs as $div){
|
|
|
|
|
|
|
|
// add accdef's here
|
|
|
|
if($has_appended_accdef === false){
|
|
|
|
|
|
|
|
$out["web"] = array_merge($out["web"], $accdefs);
|
|
|
|
$has_appended_accdef = true;
|
|
|
|
}
|
|
|
|
|
|
|
|
// add accdef's questions
|
|
|
|
if(isset($div["attributes"]["role"])){
|
|
|
|
|
|
|
|
$out["related"][] =
|
|
|
|
$this->fuckhtml
|
|
|
|
->getTextContent($div);
|
|
|
|
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
break;
|
2023-08-27 07:45:59 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
2023-11-07 14:04:56 +01:00
|
|
|
//
|
2023-11-27 07:01:56 +01:00
|
|
|
// Parse news
|
2023-11-07 14:04:56 +01:00
|
|
|
//
|
2023-11-27 07:01:56 +01:00
|
|
|
$title =
|
2023-08-27 07:45:59 +02:00
|
|
|
$this->fuckhtml
|
|
|
|
->getElementsByClassName(
|
|
|
|
$this->findstyles(
|
|
|
|
[
|
2023-11-27 07:01:56 +01:00
|
|
|
"font-size" => "16px",
|
|
|
|
"line-height" => "20px",
|
|
|
|
"font-weight" => "400"
|
2023-08-27 07:45:59 +02:00
|
|
|
],
|
|
|
|
self::is_class
|
|
|
|
),
|
|
|
|
"div"
|
|
|
|
);
|
|
|
|
|
2023-11-27 07:01:56 +01:00
|
|
|
if(count($title) !== 0){
|
|
|
|
|
|
|
|
$carousels = $this->parsecarousels();
|
|
|
|
$this->fuckhtml->load($container);
|
2023-08-27 07:45:59 +02:00
|
|
|
|
2023-11-27 07:01:56 +01:00
|
|
|
if(count($carousels) === 0){
|
|
|
|
|
|
|
|
// no carousels found
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
|
|
|
$title =
|
2023-08-27 07:45:59 +02:00
|
|
|
strtolower(
|
|
|
|
$this->fuckhtml
|
|
|
|
->getTextContent(
|
2023-11-27 07:01:56 +01:00
|
|
|
$title[0]
|
2023-08-27 07:45:59 +02:00
|
|
|
)
|
|
|
|
);
|
|
|
|
|
2023-11-27 07:01:56 +01:00
|
|
|
if(
|
|
|
|
preg_match(
|
|
|
|
'/^latest from|^top stories/',
|
|
|
|
$title
|
|
|
|
)
|
|
|
|
){
|
2023-08-27 07:45:59 +02:00
|
|
|
|
2023-11-27 07:01:56 +01:00
|
|
|
// Found news article
|
|
|
|
foreach($carousels[0] as $carousel){
|
2023-08-27 07:45:59 +02:00
|
|
|
|
2023-11-27 07:01:56 +01:00
|
|
|
if($carousel["image"] !== null){
|
2023-08-27 07:45:59 +02:00
|
|
|
|
2023-11-27 07:01:56 +01:00
|
|
|
$thumb = [
|
|
|
|
"url" => $carousel["image"],
|
|
|
|
"ratio" => "16:9"
|
|
|
|
];
|
|
|
|
}else{
|
|
|
|
|
|
|
|
$thumb = [
|
|
|
|
"url" => null,
|
|
|
|
"ratio" => null
|
|
|
|
];
|
2023-08-27 07:45:59 +02:00
|
|
|
}
|
|
|
|
|
2023-11-27 07:01:56 +01:00
|
|
|
$out["news"][] = [
|
|
|
|
"title" => $carousel["title"],
|
|
|
|
"description" => $carousel["description"],
|
|
|
|
"date" => $carousel["date"],
|
|
|
|
"thumb" => $thumb,
|
|
|
|
"url" => $carousel["url"]
|
|
|
|
];
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
elseif(
|
|
|
|
$title == "images"
|
|
|
|
){
|
|
|
|
|
|
|
|
foreach($carousels as $carousel){
|
2023-08-27 07:45:59 +02:00
|
|
|
|
2023-11-27 07:01:56 +01:00
|
|
|
foreach($carousel as $item){
|
|
|
|
|
|
|
|
$out["image"][] = [
|
|
|
|
"title" => $item["title"],
|
|
|
|
"source" => [
|
|
|
|
[
|
|
|
|
"url" => $item["url"],
|
|
|
|
"width" => $item["image_width"],
|
|
|
|
"height" => $item["image_height"]
|
|
|
|
],
|
|
|
|
[
|
|
|
|
"url" => $item["image"],
|
|
|
|
"width" => $item["thumb_width"],
|
|
|
|
"height" => $item["thumb_height"]
|
|
|
|
]
|
2023-08-27 07:45:59 +02:00
|
|
|
],
|
2023-11-27 07:01:56 +01:00
|
|
|
"url" => $item["ref"]
|
|
|
|
];
|
|
|
|
}
|
2023-08-27 07:45:59 +02:00
|
|
|
}
|
|
|
|
}
|
2023-11-27 07:01:56 +01:00
|
|
|
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
|
|
|
//
|
|
|
|
// Detect nodes with only text + links
|
|
|
|
//
|
|
|
|
|
|
|
|
// ignore elements with <style> tags
|
|
|
|
$style =
|
|
|
|
$this->fuckhtml
|
|
|
|
->getElementsByTagName("style");
|
|
|
|
|
|
|
|
if(count($style) !== 0){
|
|
|
|
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
|
|
|
$as =
|
|
|
|
$this->fuckhtml
|
|
|
|
->getElementsByTagName("a");
|
|
|
|
|
|
|
|
$description = [];
|
|
|
|
|
2024-03-15 03:33:01 +01:00
|
|
|
$pcitems =
|
|
|
|
$this->fuckhtml
|
|
|
|
->getElementsByClassName(
|
|
|
|
"pcitem",
|
|
|
|
"div"
|
|
|
|
);
|
|
|
|
|
|
|
|
if(count($pcitems) !== 0){
|
|
|
|
|
|
|
|
// ignore elements with carousels in them
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
2023-11-27 07:01:56 +01:00
|
|
|
foreach($as as $a){
|
|
|
|
|
|
|
|
//
|
|
|
|
// Detect next page
|
|
|
|
//
|
|
|
|
if(
|
|
|
|
isset($a["attributes"]["aria-label"]) &&
|
|
|
|
strtolower($a["attributes"]["aria-label"]) == "next page"
|
|
|
|
){
|
|
|
|
|
|
|
|
$out["npt"] =
|
|
|
|
$this->backend->store(
|
|
|
|
$this->fuckhtml
|
|
|
|
->getTextContent(
|
|
|
|
$a["attributes"]["href"]
|
|
|
|
),
|
|
|
|
$pagetype,
|
|
|
|
$ip
|
|
|
|
);
|
|
|
|
continue 2;
|
|
|
|
}
|
|
|
|
|
|
|
|
//
|
|
|
|
// Parse as text node
|
|
|
|
//
|
|
|
|
$container["innerHTML"] =
|
|
|
|
explode(
|
|
|
|
$a["outerHTML"],
|
|
|
|
$container["innerHTML"],
|
|
|
|
2
|
|
|
|
);
|
|
|
|
|
|
|
|
$before =
|
|
|
|
$this->fuckhtml
|
|
|
|
->getTextContent(
|
|
|
|
$container["innerHTML"][0],
|
|
|
|
false,
|
|
|
|
false
|
|
|
|
);
|
|
|
|
|
|
|
|
// set after
|
|
|
|
if(count($container["innerHTML"]) === 2){
|
|
|
|
|
|
|
|
$container["innerHTML"] =
|
|
|
|
$container["innerHTML"][1];
|
|
|
|
}else{
|
|
|
|
|
|
|
|
$container["innerHTML"] = "";
|
|
|
|
}
|
|
|
|
|
|
|
|
if($before != ""){
|
|
|
|
|
|
|
|
$description[] = [
|
|
|
|
"type" => "text",
|
|
|
|
"value" => $before
|
|
|
|
];
|
|
|
|
}
|
|
|
|
|
|
|
|
// add link
|
|
|
|
$description[] = [
|
|
|
|
"type" => "link",
|
|
|
|
"url" =>
|
|
|
|
$this->unshiturl(
|
|
|
|
$a["attributes"]
|
|
|
|
["href"]
|
|
|
|
),
|
|
|
|
"value" =>
|
|
|
|
$this->fuckhtml
|
|
|
|
->getTextContent(
|
|
|
|
$a
|
|
|
|
)
|
|
|
|
];
|
2023-08-27 07:45:59 +02:00
|
|
|
}
|
|
|
|
|
2023-11-27 07:01:56 +01:00
|
|
|
if($container["innerHTML"] != ""){
|
2023-08-27 07:45:59 +02:00
|
|
|
|
2023-11-27 07:01:56 +01:00
|
|
|
$description[] = [
|
|
|
|
"type" => "text",
|
|
|
|
"value" =>
|
2023-08-27 07:45:59 +02:00
|
|
|
$this->fuckhtml
|
|
|
|
->getTextContent(
|
2023-11-27 07:01:56 +01:00
|
|
|
$container["innerHTML"]
|
|
|
|
)
|
|
|
|
];
|
|
|
|
}
|
|
|
|
|
|
|
|
$out["answer"][] = [
|
|
|
|
"title" => "Notice",
|
|
|
|
"description" => $description,
|
|
|
|
"url" => null,
|
|
|
|
"thumb" => null,
|
|
|
|
"table" => [],
|
|
|
|
"sublink" => []
|
|
|
|
];
|
|
|
|
}
|
|
|
|
|
|
|
|
//
|
|
|
|
// remove duplicate web links cause instant answers
|
|
|
|
// sometimes contains duplicates
|
|
|
|
//
|
|
|
|
$c = count($out["web"]);
|
|
|
|
$links = [];
|
|
|
|
|
|
|
|
for($i=0; $i<$c; $i++){
|
|
|
|
|
|
|
|
foreach($links as $link){
|
|
|
|
|
|
|
|
if($out["web"][$i]["url"] == $link){
|
|
|
|
|
|
|
|
unset($out["web"][$i]);
|
|
|
|
continue 2;
|
2023-08-27 07:45:59 +02:00
|
|
|
}
|
|
|
|
}
|
2023-11-27 07:01:56 +01:00
|
|
|
|
|
|
|
$links[] = $out["web"][$i]["url"];
|
2023-08-27 07:45:59 +02:00
|
|
|
}
|
|
|
|
|
2023-11-27 07:01:56 +01:00
|
|
|
$out["web"] = array_values($out["web"]);
|
|
|
|
|
2023-08-27 07:45:59 +02:00
|
|
|
return $out;
|
2023-11-27 07:01:56 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
|
2023-08-14 05:35:08 +02:00
|
|
|
|
|
|
|
|
|
|
|
/**
 * Scrape one page of Google Images results.
 *
 * When $get["npt"] is set, the request parameters and the proxy are
 * restored from the backend. Otherwise they are built from the filters
 * in $get: s, country, nsfw, lang, time, size, ratio, color, type,
 * format and rights.
 *
 * Returns an array with the keys "status", "npt" (value returned by
 * backend->store() for the next page, or null) and "image" (list of
 * results, each with "title", "source" and "url").
 *
 * Throws Exception when the search term is empty, when the search page
 * cannot be fetched, or when detect_sorry() returns an error message.
 */
public function image($get){
	
	// generate parameters
	if($get["npt"]){
		
		// continuing a previous search: reuse stored parameters + proxy
		[$params, $proxy] =
			$this->backend->get(
				$get["npt"],
				"images"
			);
		
		$params = json_decode($params, true);
	}else{
		
		$search = $get["s"];
		if(strlen($search) === 0){
			
			throw new Exception("Search term is empty!");
		}
		
		$proxy = $this->backend->get_ip();
		$country = $get["country"];
		$nsfw = $get["nsfw"];
		$lang = $get["lang"];
		$time = $get["time"];
		$size = $get["size"];
		$ratio = $get["ratio"];
		$color = $get["color"];
		$type = $get["type"];
		$format = $get["format"];
		$rights = $get["rights"];
		
		$params = [
			"q" => $search,
			"tbm" => "isch"
		];
		
		// country
		if($country != "any"){
			
			$params["gl"] = $country;
		}
		
		// nsfw
		$params["safe"] = $nsfw == "yes" ? "off" : "active";
		
		// language
		if($lang != "any"){
			
			$params["lr"] = "lang_" . $lang;
		}
		
		// every remaining filter is packed into the comma separated "tbs" parameter
		$tbs = [];
		
		// time
		if($time != "any"){
			
			$tbs[] = "qrd:" . $time;
		}
		
		// size
		if($size != "any"){
			
			if(
				in_array(
					$size,
					["l", "s", "i"],
					true
				)
			){
				
				$tbs[] = "isz:" . $size;
			}else{
				
				$tbs[] = "tbz:lt";
				$tbs[] = "islt:" . $size;
			}
		}
		
		// ratio
		if($ratio != "any"){
			
			$tbs[] = "iar:" . $ratio;
		}
		
		// color
		if($color != "any"){
			
			if(
				in_array(
					$color,
					["color", "gray", "trans"],
					true
				)
			){
				
				$tbs[] = "ic:" . $color;
			}else{
				
				$tbs[] = "ic:specific";
				$tbs[] = "isc:" . $color;
			}
		}
		
		// type
		if($type != "any"){
			
			$tbs[] = "itp:" . $type;
		}
		
		// format
		if($format != "any"){
			
			$tbs[] = "ift:" . $format;
		}
		
		// rights
		if($rights != "any"){
			
			$tbs[] = "il:" . $rights;
		}
		
		// append tbs
		if(count($tbs) !== 0){
			
			$params["tbs"] =
				implode(",", $tbs);
		}
	}
	
	/*
	$handle = fopen("scraper/google-img.html", "r");
	$html = fread($handle, filesize("scraper/google-img.html"));
	fclose($handle);*/
	
	// scrape images
	try{
		$html =
			$this->get(
				$proxy,
				"https://www.google.com/search",
				$params
			);
	}catch(Exception $error){
		
		// keep the original failure attached as the previous exception
		throw new Exception("Failed to get search page", 0, $error);
	}
	
	// detect_sorry() also loads $html into the parser used below
	if($error = $this->detect_sorry($html)){
		
		throw new Exception($error);
	}
	
	$out = [
		"status" => "ok",
		"npt" => null,
		"image" => []
	];
	
	$images =
		$this->fuckhtml
		->getElementsByClassName(
			"islrtb isv-r",
			"div"
		);
	
	foreach($images as $image){
		
		$og_width = (int)($image["attributes"]["data-ow"] ?? 0);
		$og_height = (int)($image["attributes"]["data-oh"] ?? 0);
		$thumb_width = (int)($image["attributes"]["data-tw"] ?? 0);
		
		if(
			$og_width === 0 ||
			$og_height === 0
		){
			
			// no usable dimensions: computing the aspect ratio (or dividing
			// by it) would raise a division by zero, so skip this entry
			continue;
		}
		
		$this->fuckhtml->load($image);
		$img =
			$this->fuckhtml
			->getElementsByTagName("img");
		
		$img = isset($img[0]) ? $img[0] : null;
		
		// data-src wins over src when both are present
		$src =
			$img["attributes"]["data-src"] ??
			$img["attributes"]["src"] ??
			null;
		
		// thumbnail height is derived from the original aspect ratio
		$ratio = $og_width / $og_height;
		$thumb_height = floor($thumb_width / $ratio);
		
		$out["image"][] = [
			"title" =>
				$this->titledots(
					$this->fuckhtml
					->getTextContent(
						$image["attributes"]["data-pt"]
					)
				),
			"source" => [
				[
					"url" =>
						$this->fuckhtml
						->getTextContent(
							$image["attributes"]["data-ou"]
						),
					"width" => $og_width,
					"height" => $og_height
				],
				[
					"url" =>
						$this->fuckhtml
						->getTextContent(
							$src
						),
					"width" => $thumb_width,
					"height" => $thumb_height
				]
			],
			"url" =>
				$this->fuckhtml
				->getTextContent(
					$image["attributes"]["data-ru"]
				)
		];
	}
	
	// get next page
	// https://www.google.com/search
	// ?q=higurashi
	// &tbm=isch
	// &async=_id%3Aislrg_c%2C_fmt%3Ahtml
	// &asearch=ichunklite
	// &ved=0ahUKEwidjYXJqJSAAxWrElkFHZ07CDwQtDIIQygA
	
	if(count($out["image"]) !== 100){
		
		// no more results
		return $out;
	}
	
	if($get["npt"]){
		
		// update nextpage information
		$params["start"] = (int)$params["start"] + count($out["image"]);
		$params["ijn"] = (int)$params["ijn"] + 1;
		
		$out["npt"] =
			$this->backend->store(
				json_encode($params),
				"images",
				$proxy
			);
	}else{
		
		// scrape nextpage information
		$this->fuckhtml->load($html);
		
		$ved =
			$this->fuckhtml
			->getElementById("islrg", "div");
		
		if($ved){
			
			$ved =
				$this->fuckhtml
				->getTextContent(
					$ved["attributes"]["data-ved"]
				);
			
			// &vet=1{$ved}..i (10ahUKEwidjYXJqJSAAxWrElkFHZ07CDwQtDIIQygA..i)
			
			/*
				These 2 are handled by us
				start = start + number of results
				ijn = current page number
			*/
			// &start=100
			// &ijn=1
			
			// &imgvl=CAEY7gQgBSj3Aji8VTjXVUC4AUC3AUgAYNdV
			preg_match(
				'/var e=\'([A-z0-9]+)\';/',
				$html,
				$imgvl
			);
			
			// without the imgvl token no next page token is produced
			if(isset($imgvl[1])){
				
				$imgvl = $imgvl[1];
				
				$params["async"] = "_id:islrg_c,_fmt:html";
				$params["asearch"] = "ichunklite";
				$params["ved"] = $ved;
				$params["vet"] = "1" . $ved . "..i";
				$params["start"] = 100;
				$params["ijn"] = 1;
				$params["imgvl"] = $imgvl;
				
				$out["npt"] =
					$this->backend->store(
						json_encode($params),
						"images",
						$proxy
					);
			}
		}
	}
	
	return $out;
}
|
|
|
|
|
|
|
|
/**
 * Convert a "H:M:S", "M:S" or "S" time string into a number of seconds.
 *
 * The string is split on ":" into at most three fields; each field is
 * cast to int and weighted by its position (hours, minutes, seconds).
 */
private function hms2int($time){
	
	$fields = explode(":", $time, 3);
	
	// weights for a full h:m:s string; shorter strings start further right
	$weights = [3600, 60, 1];
	$offset = 3 - count($fields);
	
	$seconds = 0;
	
	foreach($fields as $index => $field){
		
		$seconds = $seconds + ((int)$field * $weights[$offset + $index]);
	}
	
	return $seconds;
}
|
|
|
|
|
2023-08-27 07:45:59 +02:00
|
|
|
/**
 * Load $html and collect the data that the page keeps in <style> and
 * <script> tags.
 *
 * Fills three properties:
 *   $this->computedstyle  selector => declarations, via parsestyles()
 *   $this->js_image       image id => image data
 *   $this->ask            id => parsed string from window.jsl.dh() calls
 */
private function parsejavascript($html){
	
	$this->fuckhtml->load($html);
	
	$styles =
		$this->fuckhtml
		->getElementsByTagName("style");
	
	$this->computedstyle = [];
	$this->ask = [];
	
	foreach($styles as $style){
		
		$this->computedstyle =
			array_merge(
				$this->computedstyle,
				$this->parsestyles($style["innerHTML"])
			);
	}
	
	// get images in javascript var
	preg_match(
		'/google\.ldi=({[^}]+})/',
		$html,
		$ldi
	);
	
	$this->js_image = [];
	
	if(isset($ldi[1])){
		
		// json_decode() yields null on malformed input, keep an array then
		$decoded = json_decode($ldi[1], true);
		
		if(is_array($decoded)){
			
			$this->js_image = $decoded;
		}
	}
	
	// additional js_images present in <script> tags
	// ugh i fucking hate you
	$scripts =
		$this->fuckhtml
		->getElementsByTagName("script");
	
	foreach($scripts as $script){
		
		if(!isset($script["innerHTML"])){
			
			continue;
		}
		
		preg_match_all(
			'/var s=\'(data:image[^\']+)\';var i=\[(\'[^\;]*\')];/',
			$script["innerHTML"],
			$image_grep
		);
		
		// walk every match, not just the first one found in the script
		for($m=0; $m<count($image_grep[0]); $m++){
			
			$items = explode(",", $image_grep[2][$m]);
			$value =
				$this->fuckhtml
				->getTextContent(
					$image_grep[1][$m]
				);
			
			// the same image data is registered under each listed id
			foreach($items as $item){
				
				$this->js_image[trim($item, "' ")] = $value;
			}
		}
		
		// even more javascript crap
		// "People also ask" node is loaded through javascript
		preg_match_all(
			'/window\.jsl\.dh\(\'([^\']+)\',\'(.+)\'\);/',
			$script["innerHTML"],
			$ask_grep
		);
		
		for($i=0; $i<count($ask_grep[0]); $i++){
			
			$this->ask[trim($ask_grep[1][$i])] =
				$this->fuckhtml->parseJsString(
					$ask_grep[2][$i]
				);
		}
	}
}
|
|
|
|
|
|
|
|
/**
 * Find the first selector in $this->computedstyle whose declarations are
 * exactly the ones given in $rules.
 *
 * $rules  property => value pairs that must all be present
 * $is     first character the selector must start with
 *         (callers pass self::is_class)
 *
 * Returns the selector with the leading $is characters stripped, or
 * false when nothing matches.
 */
private function findstyles($rules, $is){
	
	$wanted = count($rules);
	
	foreach($this->computedstyle as $selector => $declarations){
		
		if($selector[0] != $is){
			
			// wrong kind of selector, skip
			continue;
		}
		
		$matched = 0;
		
		foreach($declarations as $property => $value){
			
			if(
				!isset($rules[$property]) ||
				$rules[$property] != $value
			){
				
				// selector carries a declaration we did not ask for
				continue 2;
			}
			
			$matched++;
		}
		
		// every declaration matched and none of the rules is missing
		if($matched === $wanted){
			
			return ltrim($selector, $is);
		}
	}
	
	// fail, did not find a matching selector
	return false;
}
|
|
|
|
|
|
|
|
/**
 * Parse the text of a stylesheet into a lookup table.
 *
 * Returns an array keyed by selector; each value is an array of
 * property => value pairs. Comma separated selectors each receive the
 * declarations of their rule, and later rules overwrite earlier values
 * for the same selector and property.
 */
private function parsestyles($style){
	
	// one match per "selectors{declarations}" rule
	preg_match_all(
		'/([^{]+){([^}]*)}/',
		$style,
		$rules,
		PREG_SET_ORDER
	);
	
	$tags = [];
	
	foreach($rules as $rule){
		
		// read the declarations of this rule once
		$declarations = [];
		
		foreach(explode(";", $rule[2]) as $declaration){
			
			$pair = explode(":", $declaration, 2);
			
			if(count($pair) === 2){
				
				$declarations[trim($pair[0])] = trim($pair[1]);
			}
		}
		
		// then apply them to every selector of the rule
		foreach(explode(",", trim($rule[1])) as $selector){
			
			$selector = trim($selector);
			
			if(!isset($tags[$selector])){
				
				$tags[$selector] = [];
			}
			
			foreach($declarations as $property => $value){
				
				$tags[$selector][$property] = $value;
			}
		}
	}
	
	return $tags;
}
|
|
|
|
|
2023-07-27 01:03:06 +02:00
|
|
|
/**
 * Look up an image collected in $this->js_image by its id.
 *
 * Returns the string produced by parseJsString() for that entry, with
 * "https:" prepended when it starts with "//". Returns null when the id
 * is unknown or the parsed value is empty.
 */
private function getimage($id){
	
	if(!isset($this->js_image[$id])){
		
		return null;
	}
	
	$image = $this->fuckhtml->parseJsString($this->js_image[$id]);
	
	// NOTE(review): the original tested `!= ""` twice in a row; the second
	// operand may have been meant to be a different placeholder value.
	// Confirm against upstream.
	if($image == ""){
		
		return null;
	}
	
	// protocol relative URL
	if(
		preg_match(
			'/^\/\//',
			$image
		)
	){
		
		return 'https:' . $image;
	}
	
	return $image;
}
|
|
|
|
|
|
|
|
/**
 * Parse the carousels found in the currently loaded document.
 *
 * Carousels are the elements whose class matches the "padding: 16px;
 * position: relative" rule found by findstyles(). Every "pcitem" div
 * inside a carousel becomes one entry.
 *
 * $item_to_remove  passed by reference; when it is not false, the
 *                  outerHTML of each carousel is removed from it
 *
 * Returns an array keyed by carousel index. Each value is a list of
 * entries with the keys url, ref, image, thumb_width, thumb_height,
 * image_width, image_height, title, description, subtext and date.
 */
private function parsecarousels(&$item_to_remove = false){
	
	$carousels =
		$this->fuckhtml
		->getElementsByClassName(
			$this->findstyles(
				[
					"padding" => "16px",
					"position" => "relative"
				],
				self::is_class
			)
		);
	
	$return = [];
	
	for($i=0; $i<count($carousels); $i++){
		
		if(!isset($carousels[$i]["outerHTML"])){
			
			continue;
		}
		
		$this->fuckhtml->load($carousels[$i]);
		
		if($item_to_remove !== false){
			
			$item_to_remove =
				str_replace(
					$carousels[$i]["outerHTML"],
					"",
					$item_to_remove
				);
		}
		
		$pcitems =
			$this->fuckhtml
			->getElementsByClassName(
				"pcitem",
				"div"
			);
		
		foreach($pcitems as $pcitem){
			
			$this->fuckhtml->load($pcitem);
			
			$out = [
				"url" => null,
				"ref" => null,
				"image" => null,
				"thumb_width" => null,
				"thumb_height" => null,
				"image_width" => null,
				"image_height" => null,
				"title" => null,
				"description" => null,
				"subtext" => null,
				"date" => null
			];
			
			$anchors =
				$this->fuckhtml
				->getElementsByTagName("a");
			
			// without a link the url fields stay null; the url falls back
			// to the title further down
			if(isset($anchors[0]["attributes"]["href"])){
				
				$url =
					$this->unshiturl(
						$anchors[0]["attributes"]["href"],
						true
					);
				
				if(is_array($url)){
					
					// set ref
					$out["ref"] = $url["ref"];
					
					// set url
					$out["url"] = $url["url"];
					
					// set sizes
					$out["thumb_width"] = $url["thumb_width"];
					$out["thumb_height"] = $url["thumb_height"];
					$out["image_width"] = $url["image_width"];
					$out["image_height"] = $url["image_height"];
				}else{
					
					// plain string result: only the url is known
					$out["url"] = $url;
				}
			}
			
			// get image
			$imgs =
				$this->fuckhtml
				->getElementsByTagName(
					"img"
				);
			
			if(count($imgs) !== 0){
				
				// get title from image
				if(isset($imgs[0]["attributes"]["alt"])){
					
					$out["title"] =
						$this->titledots(
							$this->fuckhtml
							->getTextContent(
								$imgs[0]["attributes"]["alt"]
							)
						);
				}
				
				// get image url
				if(isset($imgs[0]["attributes"]["id"])){
					
					$out["image"] = $this->getimage($imgs[0]["attributes"]["id"]);
				}
				
				elseif(isset($imgs[0]["attributes"]["data-ll"])){
					
					$out["image"] =
						$this->fuckhtml
						->getTextContent(
							$imgs[0]["attributes"]["data-ll"]
						);
				}else{
					
					// failed to get image information
					$out["image"] = null;
				}
				
				// loose comparison: a null image is skipped as well
				if($out["image"] == ''){
					
					// found arrow image base64, skip entry
					continue;
				}
			}
			
			// get title from spans
			$title =
				$this->fuckhtml
				->getElementsByClassName(
					$this->findstyles(
						[
							"color" => "#1967d2"
						],
						self::is_class
					),
					"span"
				);
			
			if(count($title) !== 0){
				
				$out["title"] =
					$this->fuckhtml
					->getTextContent(
						$title[0]
					);
			}
			
			// get textnodes
			$textnodes =
				$this->fuckhtml
				->getElementsByClassName(
					$this->findstyles(
						[
							"white-space" => "pre-line",
							"word-wrap" => "break-word"
						],
						self::is_class
					)
				);
			
			$date = null;
			$description = null;
			
			if(count($textnodes) !== 0){
				
				// get date
				$date =
					$this->fuckhtml
					->getTextContent(
						$textnodes[count($textnodes) - 1],
						true
					);
				
				// compare against the title text; $title itself is an array
				// of elements and never equals a string
				if(str_replace("\n", " ", $date) == $out["title"]){
					
					$date = null;
				}
				
				elseif(strpos($date, "\n") !== false){
					
					// date is on the last line
					$date = explode("\n", $date);
					$date = $date[count($date) - 1];
				}
				
				elseif(strpos($date, "•") !== false){
					
					// date follows the last bullet
					$date = explode("•", $date);
					$date = ltrim($date[count($date) - 1]);
				}else{
					
					$date = null;
				}
				
				if($date !== null){
					
					$date = strtotime($date);
					
					if($date === false){
						
						// text was not a parsable date
						$date = null;
					}
				}
				
				// get description
				$description =
					$this->fuckhtml
					->getTextContent(
						$textnodes[0]
					);
				
				if($out["title"] === null){
					
					if($date === null){
						
						// no title and no date: promote the description
						$out["title"] = $description;
						$description = null;
					}else{
						
						$out["title"] =
							$out["url"] === null ?
							null :
							parse_url($out["url"], PHP_URL_HOST);
					}
				}
				
				if(isset($textnodes[1])){
					
					$out["subtext"] =
						$this->fuckhtml
						->getTextContent(
							$textnodes[1]
						);
				}
			}
			
			$out["date"] = $date;
			
			// titledots() trims a string; keep a missing description as null
			$out["description"] =
				$description === null ?
				null :
				$this->titledots($description);
			
			if($out["url"] === null){
				
				$out["url"] = $out["title"];
			}
			
			if($out["title"] == $out["description"]){
				
				$out["description"] = null;
			}
			
			$return[$i][] = $out;
		}
	}
	
	return $return;
}
|
|
|
|
|
2023-11-27 07:01:56 +01:00
|
|
|
/**
 * Extract the destination URL from a Google link and strip known
 * tracking parameters from it.
 *
 * $url          href of the link; passed through getTextContent() first
 * $return_size  when true, return an array instead of a string
 *
 * Returns the cleaned URL, or when $return_size is true an array with
 * the keys url, ref, thumb_width, thumb_height, image_width and
 * image_height. The size keys come from the link's own query string and
 * are null when absent.
 */
private function unshiturl($url, $return_size = false){
	
	// get parameters from URL
	$url =
		$this->fuckhtml
		->getTextContent($url);
	
	$linkquery = parse_url($url, PHP_URL_QUERY);
	
	if($linkquery == ""){
		
		// probably telephone number
		if($return_size){
			
			// keep the array shape callers asking for sizes rely on
			return [
				"url" => $url,
				"ref" => null,
				"thumb_width" => null,
				"thumb_height" => null,
				"image_width" => null,
				"image_height" => null
			];
		}
		
		return $url;
	}
	
	$url = $linkquery;
	
	// $query holds the parameters of the Google link itself and must stay
	// untouched until the end, the size information is read from it
	parse_str($linkquery, $query);
	
	if(isset($query["imgurl"])){
		
		$url = $query["imgurl"];
	}
	elseif(isset($query["q"])){
		
		$url = $query["q"];
	}
	
	// rewrite URLs to remove extra tracking parameters
	// parse_url() gives null when there is no host, normalise to ""
	$domain = (string)parse_url($url, PHP_URL_HOST);
	
	if(
		preg_match(
			'/wikipedia\.org$/',
			$domain
		)
	){
		
		// rewrite wikipedia mobile URLs to desktop
		$url =
			$this->replacedomain(
				$url,
				preg_replace(
					'/([a-z0-9]+)(\.m\.)/',
					'$1.',
					$domain
				)
			);
	}
	
	elseif(
		preg_match(
			'/imdb\.com$|youtube\.[^.]+$/',
			$domain
		)
	){
		
		// rewrite imdb and youtube mobile URLs too
		$url =
			$this->replacedomain(
				$url,
				preg_replace(
					'/^m\./',
					"",
					$domain
				)
			);
	}
	
	elseif(
		preg_match(
			'/play\.google\.[^.]+$/',
			$domain
		)
	){
		
		// remove referrers from play.google.com
		$url =
			$this->stripqueryparams(
				$url,
				["referrer", "hl", "gl"]
			);
	}
	
	elseif(
		preg_match(
			'/twitter\.com$/',
			$domain
		)
	){
		
		// remove more referrers from twitter.com
		$url =
			$this->stripqueryparams(
				$url,
				["ref_src"]
			);
	}
	
	elseif(
		preg_match(
			'/maps\.google\.[^.]+/',
			$domain
		)
	){
		
		if(stripos($url, "maps?") !== false){
			
			//https://maps.google.com/maps?daddr=Johnny,+603+Rue+St+Georges,+Saint-J%C3%A9r%C3%B4me,+Quebec+J7Z+5B7
			$mapsquery = parse_url($url, PHP_URL_QUERY);
			
			if(is_string($mapsquery)){
				
				parse_str($mapsquery, $mapsparams);
				
				// keep only the destination address
				if(isset($mapsparams["daddr"])){
					
					$url =
						"https://maps.google.com/maps?daddr=" .
						urlencode($mapsparams["daddr"]);
				}
			}
		}
	}
	
	if($return_size){
		
		return [
			"url" => $url,
			"ref" => isset($query["imgrefurl"]) ? $query["imgrefurl"] : null,
			"thumb_width" => isset($query["tbnw"]) ? (int)$query["tbnw"] : null,
			"thumb_height" => isset($query["tbnh"]) ? (int)$query["tbnh"] : null,
			"image_width" => isset($query["w"]) ? (int)$query["w"] : null,
			"image_height" => isset($query["h"]) ? (int)$query["h"] : null
		];
	}
	
	return $url;
}

/**
 * Remove the query parameters listed in $names from $url.
 *
 * The query string is rebuilt with http_build_query(); when no
 * parameter is left the "?" is dropped as well.
 */
private function stripqueryparams($url, $names){
	
	$oldquery = parse_url($url, PHP_URL_QUERY);
	
	if(
		!is_string($oldquery) ||
		$oldquery === ""
	){
		
		// nothing to strip
		return $url;
	}
	
	parse_str($oldquery, $params);
	
	foreach($names as $name){
		
		unset($params[$name]);
	}
	
	$newquery = http_build_query($params);
	
	if($newquery != ""){
		
		$newquery = "?" . $newquery;
	}
	
	return
		str_replace(
			"?" . $oldquery,
			$newquery,
			$url
		);
}
|
|
|
|
|
|
|
|
/**
 * Replace the host of $url with $domain, keeping the scheme.
 *
 * Only the first "http(s)://host" occurrence is rewritten, so URLs
 * embedded later in the path or query string are left alone. A callback
 * is used instead of a '$1' replacement string because a domain that
 * starts with a digit would otherwise be read as part of the group
 * number.
 */
private function replacedomain($url, $domain){
	
	return
		preg_replace_callback(
			'/(https?:\/\/)([^\/]+)/',
			function($match) use ($domain){
				
				return $match[1] . $domain;
			},
			$url,
			1
		);
}
|
|
|
|
|
|
|
|
/**
 * Strip trailing dots and whitespace from a title.
 *
 * A null $title is treated as an empty string; passing null straight to
 * rtrim() is deprecated as of PHP 8.1.
 */
private function titledots($title){
	
	return rtrim((string)$title, ". \t\n\r\0\x0B");
}
|
2024-02-25 15:51:18 +01:00
|
|
|
|
|
|
|
/**
 * Detect the "302 Moved" page Google serves instead of results.
 *
 * Loads $html into the parser as a side effect.
 *
 * Returns false when the page title is not "302 Moved". Otherwise
 * returns an error message: the "blocked" message when the first link
 * on the page points to a path starting with "/sorry", the consent
 * message in every other case.
 */
private function detect_sorry($html){
	
	$this->fuckhtml->load($html);
	$titles =
		$this->fuckhtml
		->getElementsByTagName("title");
	
	if(
		!isset($titles[0]) ||
		$titles[0]["innerHTML"] != "302 Moved"
	){
		
		return false;
	}
	
	// may be consent.google.com in europe or /sorry captcha page
	$links =
		$this->fuckhtml
		->getElementsByTagName("a");
	
	// path stays empty when the page has no link or the link has no path
	$path = "";
	
	if(isset($links[0]["attributes"]["href"])){
		
		$path =
			(string)parse_url(
				$this->fuckhtml
				->getTextContent(
					$links[0]["attributes"]["href"]
				),
				PHP_URL_PATH
			);
	}
	
	if(strpos($path, "/sorry") === 0){
		
		// found /sorry
		return "Google blocked this 4get instance. Please setup a proxy!";
	}
	
	// found consent.google, should not happen anymore
	return "Google served a GPDR consent form. This should not happen, please report if you encounter this message";
}
|
2023-07-22 20:41:14 +02:00
|
|
|
}
|