diff --git a/src/main/kotlin/org/koitharu/kotatsu/parsers/site/all/HitomiLaParser.kt b/src/main/kotlin/org/koitharu/kotatsu/parsers/site/all/HitomiLaParser.kt
new file mode 100644
index 00000000..caf525bc
--- /dev/null
+++ b/src/main/kotlin/org/koitharu/kotatsu/parsers/site/all/HitomiLaParser.kt
@@ -0,0 +1,662 @@
+package org.koitharu.kotatsu.parsers.site.all
+
+import kotlinx.coroutines.*
+import kotlinx.coroutines.sync.*
+import okhttp3.Headers
+import org.json.*
+import org.koitharu.kotatsu.parsers.*
+import org.koitharu.kotatsu.parsers.config.ConfigKey
+import org.koitharu.kotatsu.parsers.model.*
+import org.koitharu.kotatsu.parsers.util.*
+import org.koitharu.kotatsu.parsers.util.json.getStringOrNull
+import org.koitharu.kotatsu.parsers.util.json.mapJSON
+import java.nio.ByteBuffer
+import java.nio.ByteOrder
+import java.security.MessageDigest
+import java.text.SimpleDateFormat
+import java.util.*
+import kotlin.math.min
+
+/**
+ * Parser for hitomi.la. Gallery ids come from the binary `.nozomi` lists and B-tree index
+ * files served by the `ltn` subdomain; the lookup code mirrors the site's search.js,
+ * common.js and gg.js scripts.
+ */
+@OptIn(ExperimentalUnsignedTypes::class)
+@MangaSourceParser("HITOMILA", "Hitomi.La", type = ContentType.HENTAI)
+class HitomiLaParser(context: MangaLoaderContext) : MangaParser(context, MangaSource.HITOMILA) {
+
+	override val configKeyDomain = ConfigKey.Domain("hitomi.la")
+
+	private val ltnBaseUrl get() = "https://${getDomain("ltn")}"
+
+	override val availableSortOrders: Set<SortOrder> = EnumSet.of(
+		SortOrder.NEWEST,
+		SortOrder.POPULARITY,
+	)
+
+	/** Collects the tags listed with a count above 100 on the `alltags-<letter>.html` pages. */
+	override suspend fun getAvailableTags(): Set<MangaTag> = coroutineScope {
+		('a'..'z').map { alphabet ->
+			async {
+				val doc = webClient.httpGet("https://$domain/alltags-$alphabet.html").parseHtml()
+
+				doc.select(".posts > li").mapNotNull { element ->
+					val num = tagCountRegex.find(element.ownText())
+						?.groupValues?.get(1)?.toIntOrNull() ?: 0
+					if (num <= 100) {
+						return@mapNotNull null
+					}
+
+					val url = element.selectFirst("a")
+					val href = url?.attrAsRelativeUrl("href")
+						?: return@mapNotNull null
+
+					MangaTag(
+						title = url.ownText().toCamelCase(),
+						key = href.tagUrlToTag(),
+						source = source,
+					)
+				}
+			}
+		}.awaitAll().flatten().toSet()
+	}
+
+	// Ids matched by the most recent search; following pages are sliced from it by offset.
+	private var cachedSearchIds: List<Int> = emptyList()
+
+	override suspend fun getList(offset: Int, filter: MangaListFilter?): List<Manga> {
+		return when (filter) {
+			is MangaListFilter.Advanced -> {
+				if (filter.tags.isEmpty()) {
+					when (filter.sortOrder) {
+						SortOrder.POPULARITY -> {
+							getGalleryIDsFromNozomi("popular", "today", "all", offset.nextOffsetRange())
+						}
+
+						else -> {
+							getGalleryIDsFromNozomi(null, "index", "all", offset.nextOffsetRange())
+						}
+					}
+				} else {
+					if (offset == 0) {
+						cachedSearchIds = hitomiSearch(
+							filter.tags.joinToString(" ") { it.key },
+							filter.sortOrder == SortOrder.POPULARITY,
+						).toList()
+					}
+					cachedSearchPage(offset)
+				}
+			}
+
+			is MangaListFilter.Search -> {
+				if (offset == 0) {
+					cachedSearchIds = hitomiSearch(
+						filter.query,
+						filter.sortOrder == SortOrder.POPULARITY,
+					).toList()
+				}
+				cachedSearchPage(offset)
+			}
+
+			else -> getGalleryIDsFromNozomi(null, "popular", "all", offset.nextOffsetRange())
+		}.toMangaList()
+	}
+
+	/**
+	 * Returns at most [PAGE_SIZE] ids of [cachedSearchIds] starting at [offset], or an empty
+	 * list when [offset] lies outside the cached results (`subList` would throw there).
+	 */
+	private fun cachedSearchPage(offset: Int): List<Int> {
+		val ids = cachedSearchIds
+		if (offset < 0 || offset >= ids.size) {
+			return emptyList()
+		}
+		return ids.subList(offset, min(offset + PAGE_SIZE, ids.size))
+	}
+
+	// Ids are read as 4-byte ints, so one page of PAGE_SIZE ids spans PAGE_SIZE * 4 bytes.
+	private fun Int.nextOffsetRange(): LongRange {
+		val bytes = this * 4L
+		return bytes.until(bytes + PAGE_SIZE * 4L)
+	}
+
+	/**
+	 * Evaluates a whitespace separated query: every plain term must match and terms prefixed
+	 * with `-` are excluded. A term whose lookup fails is treated as matching nothing.
+	 */
+	private suspend fun hitomiSearch(query: String, sortByPopularity: Boolean = false): Set<Int> = coroutineScope {
+		val terms = query
+			.trim()
+			.replace(leadingQuestionMarkRegex, "")
+			.lowercase()
+			.split(whitespaceRegex)
+			.map {
+				it.replace('_', ' ')
+			}
+
+		val positiveTerms = LinkedList<String>()
+		val negativeTerms = LinkedList<String>()
+
+		for (term in terms) {
+			if (term.startsWith("-"))
+				negativeTerms.push(term.removePrefix("-"))
+			else if (term.isNotBlank())
+				positiveTerms.push(term)
+		}
+
+		val positiveResults = positiveTerms.map { async { getGalleryIDsForQueryOrEmpty(it) } }
+		val negativeResults = negativeTerms.map { async { getGalleryIDsForQueryOrEmpty(it) } }
+
+		val results = when {
+			sortByPopularity -> getGalleryIDsFromNozomi(null, "popular", "all")
+			positiveTerms.isEmpty() -> getGalleryIDsFromNozomi(null, "index", "all")
+			else -> emptySet<Int>()
+		}.toMutableSet()
+
+		// True once `results` holds a set to intersect with. Testing `results.isEmpty()`
+		// instead would refill the set from the next term after an empty intersection and
+		// return galleries that do not match every term.
+		var hasBaseSet = sortByPopularity || positiveTerms.isEmpty()
+
+		for (deferred in positiveResults) {
+			val ids = deferred.await()
+			if (hasBaseSet) {
+				results.retainAll(ids)
+			} else {
+				results.addAll(ids)
+				hasBaseSet = true
+			}
+		}
+
+		for (deferred in negativeResults) {
+			results.removeAll(deferred.await())
+		}
+
+		results
+	}
+
+	/** Resolves one search term; any failure other than cancellation yields an empty set. */
+	private suspend fun getGalleryIDsForQueryOrEmpty(term: String): Set<Int> {
+		return runCatching {
+			getGalleryIDsForQuery(term)
+		}.onFailure {
+			// cancellation has to propagate, otherwise a cancelled search keeps running
+			if (it is CancellationException) throw it
+		}.getOrDefault(emptySet())
+	}
+
+	// search.js
+	private suspend fun getGalleryIDsForQuery(query: String): Set<Int> {
+		val term = query.replace("_", " ")
+
+		if (term.indexOf(':') > -1) {
+			val sides = term.split(":")
+			val ns = sides[0]
+			var tag = sides[1]
+
+			var area: String? = ns
+			var language = "all"
+			when (ns) {
+				"female", "male" -> {
+					area = "tag"
+					tag = term
+				}
+
+				"language" -> {
+					area = null
+					language = tag
+					tag = "index"
+				}
+			}
+
+			return getGalleryIDsFromNozomi(area, tag, language)
+		}
+
+		val key = hashTerm(term)
+		val field = "galleries"
+
+		val node = getNodeAtAddress(field, 0)
+
+		val data = bSearch(field, key, node)
+			?: return emptySet()
+
+		return getGalleryIDsFromData(data)
+	}
+
+	/**
+	 * Reads an id list from the galleries data file. [data] is the (offset, length) pair
+	 * returned by [bSearch]; the payload is a big-endian Int count followed by the ids.
+	 */
+	private suspend fun getGalleryIDsFromData(data: Pair<Long, Int>): Set<Int> {
+		val url = "$ltnBaseUrl/galleriesindex/galleries.${galleriesIndexVersion.get()}.data"
+		val (offset, length) = data
+		if (length > 100000000 || length <= 0)
+			throw Exception("length $length is too long")
+
+		val inbuf = getURLAtRange(url, offset.until(offset + length))
+
+		val galleryIDs = mutableSetOf<Int>()
+
+		val buffer = ByteBuffer
+			.wrap(inbuf)
+			.order(ByteOrder.BIG_ENDIAN)
+
+		val numberOfGalleryIDs = buffer.int
+
+		val expectedLength = numberOfGalleryIDs * 4 + 4
+
+		if (numberOfGalleryIDs > 10000000 || numberOfGalleryIDs <= 0)
+			throw Exception("number_of_galleryids $numberOfGalleryIDs is too long")
+		else if (inbuf.size != expectedLength)
+			throw Exception("inbuf.byteLength ${inbuf.size} != expected_length $expectedLength")
+
+		for (i in 0.until(numberOfGalleryIDs))
+			galleryIDs.add(buffer.int)
+
+		return galleryIDs
+	}
+
+	/**
+	 * Looks [key] up in the B-tree index of [field], starting at [node] and fetching child
+	 * nodes on demand. Returns the (offset, length) of the matching data or null.
+	 */
+	private suspend fun bSearch(field: String, key: UByteArray, node: Node): Pair<Long, Int>? {
+		fun compareArrayBuffers(dv1: UByteArray, dv2: UByteArray): Int {
+			val top = min(dv1.size, dv2.size)
+
+			for (i in 0.until(top)) {
+				if (dv1[i] < dv2[i])
+					return -1
+				else if (dv1[i] > dv2[i])
+					return 1
+			}
+
+			return 0
+		}
+
+		fun locateKey(key: UByteArray, node: Node): Pair<Boolean, Int> {
+			for (i in node.keys.indices) {
+				val cmpResult = compareArrayBuffers(key, node.keys[i])
+
+				if (cmpResult <= 0)
+					return Pair(cmpResult == 0, i)
+			}
+
+			return Pair(false, node.keys.size)
+		}
+
+		fun isLeaf(node: Node): Boolean = node.subNodeAddresses.all { it == 0L }
+
+		if (node.keys.isEmpty())
+			return null
+
+		val (there, where) = locateKey(key, node)
+		if (there)
+			return node.datas[where]
+		else if (isLeaf(node))
+			return null
+
+		val nextAddress = node.subNodeAddresses[where]
+		// address 0 is where the lookup starts; following it again would never terminate
+		if (nextAddress == 0L)
+			return null
+
+		return bSearch(field, key, getNodeAtAddress(field, nextAddress))
+	}
+
+	/**
+	 * Downloads a `.nozomi` file (or only its byte [range]) and decodes it as consecutive
+	 * big-endian Int gallery ids.
+	 */
+	private suspend fun getGalleryIDsFromNozomi(
+		area: String?,
+		tag: String,
+		language: String,
+		range: LongRange? = null,
+	): Set<Int> {
+		val nozomiAddress = when (area) {
+			null -> "$ltnBaseUrl/$tag-$language.nozomi"
+			else -> "$ltnBaseUrl/$area/$tag-$language.nozomi"
+		}
+
+		val bytes = getURLAtRange(nozomiAddress, range)
+		val nozomi = mutableSetOf<Int>()
+
+		val arrayBuffer = ByteBuffer
+			.wrap(bytes)
+			.order(ByteOrder.BIG_ENDIAN)
+
+		// Read whole ids only: with a trailing partial id `hasRemaining()` would still be
+		// true and the next read would throw BufferUnderflowException.
+		while (arrayBuffer.remaining() >= Int.SIZE_BYTES)
+			nozomi.add(arrayBuffer.int)
+
+		return nozomi
+	}
+
+	private val tagIndexVersion = SuspendLazy { getIndexVersion("tagindex") }
+	private val galleriesIndexVersion = SuspendLazy { getIndexVersion("galleriesindex") }
+
+	private suspend fun getIndexVersion(name: String) =
+		webClient.httpGet("$ltnBaseUrl/$name/version?_=${System.currentTimeMillis()}").parseRaw()
+
+	// One B-tree node: keys[i] belongs to datas[i] (offset, length); subNodeAddresses are child offsets.
+	private data class Node(
+		val keys: List<UByteArray>,
+		val datas: List<Pair<Long, Int>>,
+		val subNodeAddresses: List<Long>,
+	)
+
+	/**
+	 * Decodes a big-endian node: an Int key count, each key as Int size plus bytes, an Int
+	 * data count, each data as (Long offset, Int length), then 17 Long child addresses.
+	 */
+	private fun decodeNode(data: ByteArray): Node {
+		val buffer = ByteBuffer
+			.wrap(data)
+			.order(ByteOrder.BIG_ENDIAN)
+
+		val uData = data.toUByteArray()
+
+		val numberOfKeys = buffer.int
+		val keys = ArrayList<UByteArray>()
+
+		for (i in 0.until(numberOfKeys)) {
+			val keySize = buffer.int
+
+			if (keySize <= 0 || keySize > 32)
+				throw Exception("fatal: !keySize || keySize > 32")
+
+			keys.add(uData.sliceArray(buffer.position().until(buffer.position() + keySize)))
+			buffer.position(buffer.position() + keySize)
+		}
+
+		val numberOfDatas = buffer.int
+		val datas = ArrayList<Pair<Long, Int>>()
+
+		for (i in 0.until(numberOfDatas)) {
+			val offset = buffer.long
+			val length = buffer.int
+
+			datas.add(Pair(offset, length))
+		}
+
+		val numberOfSubNodeAddresses = 16 + 1
+		val subNodeAddresses = ArrayList<Long>()
+
+		for (i in 0.until(numberOfSubNodeAddresses)) {
+			val subNodeAddress = buffer.long
+			subNodeAddresses.add(subNodeAddress)
+		}
+
+		return Node(keys, datas, subNodeAddresses)
+	}
+
+	private suspend fun getNodeAtAddress(field: String, address: Long): Node {
+		val url =
+			when (field) {
+				"galleries" -> "$ltnBaseUrl/galleriesindex/galleries.${galleriesIndexVersion.get()}.index"
+				"languages" -> "$ltnBaseUrl/galleriesindex/languages.${galleriesIndexVersion.get()}.index"
+				"nozomiurl" -> "$ltnBaseUrl/galleriesindex/nozomiurl.${galleriesIndexVersion.get()}.index"
+				else -> "$ltnBaseUrl/tagindex/$field.${tagIndexVersion.get()}.index"
+			}
+
+		val nodedata = getURLAtRange(url, address.until(address + 464))
+
+		return decodeNode(nodedata)
+	}
+
+	private suspend fun getURLAtRange(url: String, range: LongRange? = null): ByteArray {
+		val rangeHeaders = when (range) {
+			null -> Headers.headersOf()
+			else -> Headers.headersOf("Range", "bytes=${range.first}-${range.last}")
+		}
+
+		return webClient.httpGet(url, rangeHeaders).parseBytes()
+	}
+
+	private fun hashTerm(term: String): UByteArray {
+		return sha256(term.toByteArray()).copyOfRange(0, 4).toUByteArray()
+	}
+
+	private fun sha256(data: ByteArray): ByteArray {
+		return MessageDigest.getInstance("SHA-256").digest(data)
+	}
+
+	/**
+	 * Loads the `galleryblock` page of every id concurrently and maps it to a [Manga];
+	 * ids whose page cannot be loaded or parsed are skipped.
+	 */
+	private suspend fun Collection<Int>.toMangaList(): List<Manga> {
+		return coroutineScope {
+			map { id ->
+				async {
+					runCatching {
+						val doc = webClient.httpGet("$ltnBaseUrl/galleryblock/$id.html").parseHtml()
+
+						Manga(
+							id = generateUid(id.toString()),
+							title = doc.selectFirstOrThrow("h1").text(),
+							url = id.toString(),
+							coverUrl = "https:" + doc.selectFirstOrThrow("picture > source")
+								.attr("data-srcset")
+								.substringBefore(" "),
+							publicUrl = doc.selectFirstOrThrow("h1 > a")
+								.attrAsRelativeUrl("href")
+								.toAbsoluteUrl(domain),
+							author = null,
+							tags = emptySet(),
+							isNsfw = true,
+							rating = RATING_UNKNOWN,
+							altTitle = null,
+							state = null,
+							source = source,
+						)
+					}.onFailure {
+						// only parsing/network failures are skipped, never cancellation
+						if (it is CancellationException) throw it
+					}.getOrNull()
+				}
+			}.awaitAll().filterNotNull()
+		}
+	}
+
+	override suspend fun getDetails(manga: Manga): Manga {
+		val json = webClient.httpGet("$ltnBaseUrl/galleries/${manga.url}.js")
+			.parseRaw()
+			.substringAfter("var galleryinfo = ")
+			.let(::JSONObject)
+
+		// SimpleDateFormat is not thread-safe, so it is created per call instead of shared
+		val dateFormat = SimpleDateFormat("yyyy-MM-dd HH:mm:ss", Locale.ENGLISH)
+
+		return manga.copy(
+			title = json.getString("title"),
+			largeCoverUrl = json.getJSONArray("files").getJSONObject(0).let {
+				val hash = it.getString("hash")
+				val commonId = commonImageId()
+				val imageId = imageIdFromHash(hash)
+				val subDomain = 'a' + subdomainOffset(imageId)
+
+				"https://${getDomain("${subDomain}a")}/webp/$commonId$imageId/$hash.webp"
+			},
+			author = json.optJSONArray("artists")
+				?.mapJSON { it.getString("artist").toCamelCase() }
+				?.joinToString(),
+			publicUrl = json.getString("galleryurl").toAbsoluteUrl(domain),
+			tags = buildSet {
+				json.optJSONArray("characters")
+					?.mapToTags("character")
+					?.let(::addAll)
+				json.optJSONArray("tags")
+					?.mapToTags("tag")
+					?.let(::addAll)
+				json.optJSONArray("artists")
+					?.mapToTags("artist")
+					?.let(::addAll)
+				json.optJSONArray("parodys")
+					?.mapToTags("parody")
+					?.let(::addAll)
+				json.optJSONArray("groups")
+					?.mapToTags("group")
+					?.let(::addAll)
+			},
+			chapters = listOf(
+				MangaChapter(
+					id = generateUid(manga.url),
+					url = manga.url,
+					name = json.getString("title"),
+					scanlator = json.getString("type").toTitleCase(),
+					number = 1,
+					branch = json.getString("language_localname"),
+					source = source,
+					uploadDate = dateFormat.tryParse(json.getString("date").substringBeforeLast("-")),
+				),
+			),
+		)
+	}
+
+	private fun JSONArray.mapToTags(key: String): Set<MangaTag> {
+		return mapJSON { jo ->
+			MangaTag(
+				title = jo.getString(key).toCamelCase().let { title ->
+					if (jo.getStringOrNull("female")?.toIntOrNull() == 1) {
+						"$title ♀"
+					} else if (jo.getStringOrNull("male")?.toIntOrNull() == 1) {
+						"$title ♂"
+					} else {
+						title
+					}
+				},
+				key = jo.getString("url").tagUrlToTag(),
+				source = source,
+			)
+		}.toSet()
+	}
+
+	private fun String.tagUrlToTag(): String {
+		val urlContent = this.split("/")
+		val ns = urlContent[1]
+		val tag = urlContent[2]
+			.substringBeforeLast("-")
+			.urlDecode()
+			.replace(" ", "_")
+
+		return if (tag.split(":")[0] in listOf("female", "male")) {
+			tag
+		} else {
+			"$ns:$tag"
+		}
+	}
+
+	override suspend fun getRelatedManga(seed: Manga): List<Manga> {
+		val json = webClient.httpGet("$ltnBaseUrl/galleries/${seed.url}.js")
+			.parseRaw()
+			.substringAfter("var galleryinfo = ")
+			.let(::JSONObject)
+
+		// any better way to get List<Int> from this json?
+		return json.getJSONArray("related").let {
+			0.until(it.length()).map { i -> it.getInt(i) }
+		}.toMangaList()
+	}
+
+	override suspend fun getPages(chapter: MangaChapter): List<MangaPage> {
+		val json = webClient.httpGet("$ltnBaseUrl/galleries/${chapter.url}.js")
+			.parseRaw()
+			.substringAfter("var galleryinfo = ")
+			.let(::JSONObject)
+
+		return json.getJSONArray("files").mapJSON { image ->
+			val hash = image.getString("hash")
+			val commonId = commonImageId()
+			val imageId = imageIdFromHash(hash)
+			val subDomain = 'a' + subdomainOffset(imageId)
+
+			MangaPage(
+				id = generateUid(hash),
+				url = "https://${getDomain("${subDomain}a")}/webp/$commonId$imageId/$hash.webp",
+				preview = "https://${getDomain("${subDomain}tn")}/webpsmalltn/${thumbPathFromHash(hash)}/$hash.webp",
+				source = source,
+			)
+		}
+	}
+
+	/// --->
+
+	private var scriptLastRetrieval: Long? = null
+	private val mutex = Mutex()
+	private var subdomainOffsetDefault = 0
+	private val subdomainOffsetMap = mutableMapOf<Int, Int>()
+	private var commonImageId = ""
+
+	/**
+	 * Re-downloads gg.js when the cached values are older than 60 seconds and extracts the
+	 * subdomain offsets and the common image id. Everything is parsed before any field is
+	 * assigned, so a script that cannot be parsed leaves the previous values untouched.
+	 */
+	private suspend fun refreshScript() = mutex.withLock {
+		val lastRetrieval = scriptLastRetrieval
+		if (lastRetrieval == null || (lastRetrieval + 60000) < System.currentTimeMillis()) {
+			val ggScript = webClient.httpGet("$ltnBaseUrl/gg.js?_=${System.currentTimeMillis()}").parseRaw()
+
+			val defaultOffset = ggScript.firstGroupOf(defaultOffsetRegex).toInt()
+			val o = ggScript.firstGroupOf(caseOffsetRegex).toInt()
+			val cases = caseRegex.findAll(ggScript).map { it.groupValues[1].toInt() }.toList()
+			val imageId = ggScript.firstGroupOf(commonImageIdRegex)
+
+			subdomainOffsetDefault = defaultOffset
+			subdomainOffsetMap.clear()
+			cases.forEach { case ->
+				subdomainOffsetMap[case] = o
+			}
+			commonImageId = imageId
+
+			scriptLastRetrieval = System.currentTimeMillis()
+		}
+	}
+
+	// First capture group of [regex] in the receiver (the gg.js source), or a descriptive failure.
+	private fun String.firstGroupOf(regex: Regex): String {
+		return regex.find(this)?.groupValues?.get(1)
+			?: error("gg.js: no match for /${regex.pattern}/")
+	}
+
+	// m <-- gg.js
+	private suspend fun subdomainOffset(imageId: Int): Int {
+		refreshScript()
+		return subdomainOffsetMap[imageId] ?: subdomainOffsetDefault
+	}
+
+	// b <-- gg.js
+	private suspend fun commonImageId(): String {
+		refreshScript()
+		return commonImageId
+	}
+
+	// s <-- gg.js
+	private fun imageIdFromHash(hash: String): Int {
+		val match = requireNotNull(imageIdRegex.find(hash)) { "Invalid image hash: $hash" }
+		return match.groupValues.let { it[2] + it[1] }.toInt(16)
+	}
+
+	// real_full_path_from_hash <-- common.js
+	private fun thumbPathFromHash(hash: String): String {
+		return hash.replace(thumbPathRegex, "$2/$1")
+	}
+
+	companion object {
+		private const val PAGE_SIZE = 25
+
+		private val tagCountRegex = Regex("""\((\d+)\)""")
+		private val leadingQuestionMarkRegex = Regex("""^\?""")
+		private val whitespaceRegex = Regex("\\s+")
+		private val defaultOffsetRegex = Regex("var o = (\\d)")
+		private val caseOffsetRegex = Regex("o = (\\d); break;")
+		private val caseRegex = Regex("case (\\d+):")
+		private val commonImageIdRegex = Regex("b: '(.+)'")
+		private val imageIdRegex = Regex("(..)(.)$")
+		private val thumbPathRegex = Regex("""^.*(..)(.)$""")
+	}
+}
diff --git a/src/main/kotlin/org/koitharu/kotatsu/parsers/util/Parse.kt b/src/main/kotlin/org/koitharu/kotatsu/parsers/util/Parse.kt
index 342b71cf..5ebd8807 100644
--- a/src/main/kotlin/org/koitharu/kotatsu/parsers/util/Parse.kt
+++ b/src/main/kotlin/org/koitharu/kotatsu/parsers/util/Parse.kt
@@ -52,6 +52,12 @@ fun Response.parseRaw(): String = try {
 	closeQuietly()
 }
 
+fun Response.parseBytes(): ByteArray = try {
+	requireBody().bytes()
+} finally {
+	closeQuietly()
+}
+
 /**
  * Convert url to relative if it is on [domain]
  * @return an url relative to the [domain] or absolute, if domain is mismatching