[Grouple] Selecting the fastest pages server

pull/154/head
Koitharu 3 years ago
parent e51b33c74a
commit 454b24ec88
No known key found for this signature in database
GPG Key ID: 8E861F8CE6E7CE27

@ -1,5 +1,7 @@
package org.koitharu.kotatsu.parsers.site.grouple
import kotlinx.coroutines.async
import kotlinx.coroutines.coroutineScope
import okhttp3.Headers
import okhttp3.HttpUrl.Companion.toHttpUrl
import okhttp3.Response
@ -21,298 +23,306 @@ private const val NSFW_ALERT = "сексуальные сцены"
private const val NOTHING_FOUND = "Ничего не найдено"
internal abstract class GroupleParser(
context: MangaLoaderContext,
source: MangaSource,
userAgent: String,
private val siteId: Int,
context: MangaLoaderContext,
source: MangaSource,
userAgent: String,
private val siteId: Int,
) : MangaParser(context, source), MangaParserAuthProvider {
override val headers = Headers.Builder()
.add("User-Agent", userAgent)
.build()
@Volatile
private var cachedPagesServer: String? = null
override val sortOrders: Set<SortOrder> = EnumSet.of(
SortOrder.UPDATED,
SortOrder.POPULARITY,
SortOrder.NEWEST,
SortOrder.RATING,
)
override val headers = Headers.Builder()
.add("User-Agent", userAgent)
.build()
override val authUrl: String
get() {
val targetUri = "https://${domain}/".urlEncoded()
return "https://grouple.co/internal/auth/sso?siteId=$siteId&=targetUri=$targetUri"
}
override val sortOrders: Set<SortOrder> = EnumSet.of(
SortOrder.UPDATED,
SortOrder.POPULARITY,
SortOrder.NEWEST,
SortOrder.RATING,
)
override val isAuthorized: Boolean
get() = context.cookieJar.getCookies(domain).any { it.name == "gwt" }
override val authUrl: String
get() {
val targetUri = "https://${domain}/".urlEncoded()
return "https://grouple.co/internal/auth/sso?siteId=$siteId&=targetUri=$targetUri"
}
override suspend fun getList(
offset: Int,
query: String?,
tags: Set<MangaTag>?,
sortOrder: SortOrder,
): List<Manga> {
val domain = domain
val doc = when {
!query.isNullOrEmpty() -> webClient.httpPost(
"https://$domain/search",
mapOf(
"q" to query.urlEncoded(),
"offset" to (offset upBy PAGE_SIZE_SEARCH).toString(),
),
)
override val isAuthorized: Boolean
get() = context.cookieJar.getCookies(domain).any { it.name == "gwt" }
tags.isNullOrEmpty() -> webClient.httpGet(
"https://$domain/list?sortType=${
getSortKey(sortOrder)
}&offset=${offset upBy PAGE_SIZE}",
)
override suspend fun getList(
offset: Int,
query: String?,
tags: Set<MangaTag>?,
sortOrder: SortOrder,
): List<Manga> {
val domain = domain
val doc = when {
!query.isNullOrEmpty() -> webClient.httpPost(
"https://$domain/search",
mapOf(
"q" to query.urlEncoded(),
"offset" to (offset upBy PAGE_SIZE_SEARCH).toString(),
),
)
tags.size == 1 -> webClient.httpGet(
"https://$domain/list/genre/${tags.first().key}?sortType=${
getSortKey(sortOrder)
}&offset=${offset upBy PAGE_SIZE}",
)
tags.isNullOrEmpty() -> webClient.httpGet(
"https://$domain/list?sortType=${
getSortKey(sortOrder)
}&offset=${offset upBy PAGE_SIZE}",
)
offset > 0 -> return emptyList()
else -> advancedSearch(domain, tags)
}.parseHtml().body()
val root = (doc.getElementById("mangaBox") ?: doc.getElementById("mangaResults"))
?: doc.parseFailed("Cannot find root")
val tiles = root.selectFirst("div.tiles.row") ?: if (
root.select(".alert").any { it.ownText() == NOTHING_FOUND }
) {
return emptyList()
} else {
doc.parseFailed("No tiles found")
}
val baseHost = root.baseUri().toHttpUrl().host
return tiles.select("div.tile").mapNotNull { node ->
val imgDiv = node.selectFirst("div.img") ?: return@mapNotNull null
val descDiv = node.selectFirst("div.desc") ?: return@mapNotNull null
if (descDiv.selectFirst("i.fa-user") != null) {
return@mapNotNull null // skip author
}
val href = imgDiv.selectFirst("a")?.attrAsAbsoluteUrlOrNull("href")
if (href == null || href.toHttpUrl().host != baseHost) {
return@mapNotNull null // skip external links
}
val title = descDiv.selectFirst("h3")?.selectFirst("a")?.text()
?: return@mapNotNull null
val tileInfo = descDiv.selectFirst("div.tile-info")
val relUrl = href.toRelativeUrl(baseHost)
Manga(
id = generateUid(relUrl),
url = relUrl,
publicUrl = href,
title = title,
altTitle = descDiv.selectFirst("h4")?.text(),
coverUrl = imgDiv.selectFirst("img.lazy")?.attr("data-original")?.replace("_p.", ".").orEmpty(),
rating = runCatching {
node.selectFirst(".compact-rate")
?.attr("title")
?.toFloatOrNull()
?.div(5f)
}.getOrNull() ?: RATING_UNKNOWN,
author = tileInfo?.selectFirst("a.person-link")?.text(),
isNsfw = false,
tags = runCatching {
tileInfo?.select("a.element-link")
?.mapToSet {
MangaTag(
title = it.text().toTitleCase(),
key = it.attr("href").substringAfterLast('/'),
source = source,
)
}
}.getOrNull().orEmpty(),
state = when {
node.selectFirst("div.tags")
?.selectFirst("span.mangaCompleted") != null -> MangaState.FINISHED
tags.size == 1 -> webClient.httpGet(
"https://$domain/list/genre/${tags.first().key}?sortType=${
getSortKey(sortOrder)
}&offset=${offset upBy PAGE_SIZE}",
)
else -> null
},
source = source,
)
}
}
offset > 0 -> return emptyList()
else -> advancedSearch(domain, tags)
}.parseHtml().body()
val root = (doc.getElementById("mangaBox") ?: doc.getElementById("mangaResults"))
?: doc.parseFailed("Cannot find root")
val tiles = root.selectFirst("div.tiles.row") ?: if (
root.select(".alert").any { it.ownText() == NOTHING_FOUND }
) {
return emptyList()
} else {
doc.parseFailed("No tiles found")
}
val baseHost = root.baseUri().toHttpUrl().host
return tiles.select("div.tile").mapNotNull { node ->
val imgDiv = node.selectFirst("div.img") ?: return@mapNotNull null
val descDiv = node.selectFirst("div.desc") ?: return@mapNotNull null
if (descDiv.selectFirst("i.fa-user") != null) {
return@mapNotNull null // skip author
}
val href = imgDiv.selectFirst("a")?.attrAsAbsoluteUrlOrNull("href")
if (href == null || href.toHttpUrl().host != baseHost) {
return@mapNotNull null // skip external links
}
val title = descDiv.selectFirst("h3")?.selectFirst("a")?.text()
?: return@mapNotNull null
val tileInfo = descDiv.selectFirst("div.tile-info")
val relUrl = href.toRelativeUrl(baseHost)
Manga(
id = generateUid(relUrl),
url = relUrl,
publicUrl = href,
title = title,
altTitle = descDiv.selectFirst("h4")?.text(),
coverUrl = imgDiv.selectFirst("img.lazy")?.attr("data-original")?.replace("_p.", ".").orEmpty(),
rating = runCatching {
node.selectFirst(".compact-rate")
?.attr("title")
?.toFloatOrNull()
?.div(5f)
}.getOrNull() ?: RATING_UNKNOWN,
author = tileInfo?.selectFirst("a.person-link")?.text(),
isNsfw = false,
tags = runCatching {
tileInfo?.select("a.element-link")
?.mapToSet {
MangaTag(
title = it.text().toTitleCase(),
key = it.attr("href").substringAfterLast('/'),
source = source,
)
}
}.getOrNull().orEmpty(),
state = when {
node.selectFirst("div.tags")
?.selectFirst("span.mangaCompleted") != null -> MangaState.FINISHED
override suspend fun getDetails(manga: Manga): Manga {
val doc = webClient.httpGet(manga.url.toAbsoluteUrl(domain)).checkAuthRequired().parseHtml()
val root = doc.body().getElementById("mangaBox")?.selectFirst("div.leftContent")
?: doc.parseFailed("Cannot find root")
val dateFormat = SimpleDateFormat("dd.MM.yy", Locale.US)
val coverImg = root.selectFirst("div.subject-cover")?.selectFirst("img")
return manga.copy(
description = root.selectFirst("div.manga-description")?.html(),
largeCoverUrl = coverImg?.attr("data-full"),
coverUrl = coverImg?.attr("data-thumb") ?: manga.coverUrl,
tags = manga.tags + root.select("div.subject-meta").select("span.elem_genre ")
.mapNotNull {
val a = it.selectFirst("a.element-link") ?: return@mapNotNull null
MangaTag(
title = a.text().toTitleCase(),
key = a.attr("href").substringAfterLast('/'),
source = source,
)
},
author = root.selectFirst("a.person-link")?.text() ?: manga.author,
isNsfw = root.select(".alert-warning").any { it.ownText().contains(NSFW_ALERT) },
chapters = root.selectFirst("div.chapters-link")?.selectFirst("table")
?.select("tr:has(td > a)")?.asReversed()?.mapChapters { i, tr ->
val a = tr.selectFirst("a.chapter-link") ?: return@mapChapters null
val href = a.attrAsRelativeUrl("href")
var translators = ""
val translatorElement = a.attr("title")
if (!translatorElement.isNullOrBlank()) {
translators = translatorElement
.replace("(Переводчик),", "&")
.removeSuffix(" (Переводчик)")
}
MangaChapter(
id = generateUid(href),
name = tr.selectFirst("a")?.text().orEmpty().removePrefix(manga.title).trim(),
number = i + 1,
url = href,
uploadDate = dateFormat.tryParse(tr.selectFirst("td.date")?.text()),
scanlator = translators,
source = source,
branch = null,
)
},
)
}
else -> null
},
source = source,
)
}
}
override suspend fun getPages(chapter: MangaChapter): List<MangaPage> {
val doc = webClient.httpGet(chapter.url.toAbsoluteUrl(domain) + "?mtr=1")
.checkAuthRequired()
.parseHtml()
val scripts = doc.select("script")
for (script in scripts) {
val data = script.html()
val pos = data.indexOf("rm_h.readerInit( 0,")
if (pos == -1) {
continue
}
val json = data.substring(pos)
.substringAfter('(')
.substringBefore('\n')
.substringBeforeLast(')')
if (json.isEmpty()) {
continue
}
val ja = JSONArray("[$json]")
val pages = ja.getJSONArray(1)
val servers = ja.getJSONArray(3).mapJSON { it.getString("path") }
val serversStr = servers.joinToString("|")
return (0 until pages.length()).map { i ->
val page = pages.getJSONArray(i)
val primaryServer = page.getString(0)
val url = page.getString(2)
MangaPage(
id = generateUid(url),
url = "$primaryServer|$serversStr|$url",
preview = null,
referer = chapter.url,
source = source,
)
}
}
doc.parseFailed("Pages list not found at ${chapter.url}")
}
override suspend fun getDetails(manga: Manga): Manga {
val doc = webClient.httpGet(manga.url.toAbsoluteUrl(domain)).checkAuthRequired().parseHtml()
val root = doc.body().getElementById("mangaBox")?.selectFirst("div.leftContent")
?: doc.parseFailed("Cannot find root")
val dateFormat = SimpleDateFormat("dd.MM.yy", Locale.US)
val coverImg = root.selectFirst("div.subject-cover")?.selectFirst("img")
return manga.copy(
description = root.selectFirst("div.manga-description")?.html(),
largeCoverUrl = coverImg?.attr("data-full"),
coverUrl = coverImg?.attr("data-thumb") ?: manga.coverUrl,
tags = manga.tags + root.select("div.subject-meta").select("span.elem_genre ")
.mapNotNull {
val a = it.selectFirst("a.element-link") ?: return@mapNotNull null
MangaTag(
title = a.text().toTitleCase(),
key = a.attr("href").substringAfterLast('/'),
source = source,
)
},
author = root.selectFirst("a.person-link")?.text() ?: manga.author,
isNsfw = root.select(".alert-warning").any { it.ownText().contains(NSFW_ALERT) },
chapters = root.selectFirst("div.chapters-link")?.selectFirst("table")
?.select("tr:has(td > a)")?.asReversed()?.mapChapters { i, tr ->
val a = tr.selectFirst("a.chapter-link") ?: return@mapChapters null
val href = a.attrAsRelativeUrl("href")
var translators = ""
val translatorElement = a.attr("title")
if (!translatorElement.isNullOrBlank()) {
translators = translatorElement
.replace("(Переводчик),", "&")
.removeSuffix(" (Переводчик)")
}
MangaChapter(
id = generateUid(href),
name = tr.selectFirst("a")?.text().orEmpty().removePrefix(manga.title).trim(),
number = i + 1,
url = href,
uploadDate = dateFormat.tryParse(tr.selectFirst("td.date")?.text()),
scanlator = translators,
source = source,
branch = null,
)
},
)
}
override suspend fun getPageUrl(page: MangaPage): String {
val parts = page.url.split('|')
val path = parts.last()
val servers = parts.dropLast(1).toSet()
for (server in servers) {
val url = server + path
if (tryHead(url)) {
return url
}
}
val fallbackServer = servers.firstOrNull() ?: throw ParseException("Cannot find any page url", page.url)
return fallbackServer + path
}
override suspend fun getPages(chapter: MangaChapter): List<MangaPage> {
val doc = webClient.httpGet(chapter.url.toAbsoluteUrl(domain) + "?mtr=1")
.checkAuthRequired()
.parseHtml()
val scripts = doc.select("script")
for (script in scripts) {
val data = script.html()
val pos = data.indexOf("rm_h.readerInit( 0,")
if (pos == -1) {
continue
}
val json = data.substring(pos)
.substringAfter('(')
.substringBefore('\n')
.substringBeforeLast(')')
if (json.isEmpty()) {
continue
}
val ja = JSONArray("[$json]")
val pages = ja.getJSONArray(1)
val servers = ja.getJSONArray(3).mapJSON { it.getString("path") }
val serversStr = servers.joinToString("|")
return (0 until pages.length()).map { i ->
val page = pages.getJSONArray(i)
val primaryServer = page.getString(0)
val url = page.getString(2)
MangaPage(
id = generateUid(url),
url = "$primaryServer|$serversStr|$url",
preview = null,
referer = chapter.url,
source = source,
)
}
}
doc.parseFailed("Pages list not found at ${chapter.url}")
}
override suspend fun getTags(): Set<MangaTag> {
val doc = webClient.httpGet("https://${domain}/list/genres/sort_name").parseHtml()
val root = doc.body().getElementById("mangaBox")?.selectFirst("div.leftContent")
?.selectFirst("table.table") ?: doc.parseFailed("Cannot find root")
return root.select("a.element-link").mapToSet { a ->
MangaTag(
title = a.text().toTitleCase(),
key = a.attr("href").substringAfterLast('/'),
source = source,
)
}
}
override suspend fun getPageUrl(page: MangaPage): String {
val parts = page.url.split('|')
val path = parts.last()
val servers = parts.dropLast(1).toSet()
val cachedServer = cachedPagesServer
if (cachedServer != null && cachedServer in servers && tryHead(cachedServer + path)) {
return cachedServer + path
}
val server = coroutineScope {
servers.map { server ->
async {
if (tryHead(server + path)) server else null
}
}.awaitFirst { it != null }
}
cachedPagesServer = server
return checkNotNull(server + path)
}
override suspend fun getUsername(): String {
val root = webClient.httpGet("https://grouple.co/").parseHtml().body()
val element = root.selectFirst("img.user-avatar") ?: throw AuthRequiredException(source)
val res = element.parent()?.text()
return if (res.isNullOrEmpty()) {
root.parseFailed("Cannot find username")
} else res
}
override suspend fun getTags(): Set<MangaTag> {
val doc = webClient.httpGet("https://${domain}/list/genres/sort_name").parseHtml()
val root = doc.body().getElementById("mangaBox")?.selectFirst("div.leftContent")
?.selectFirst("table.table") ?: doc.parseFailed("Cannot find root")
return root.select("a.element-link").mapToSet { a ->
MangaTag(
title = a.text().toTitleCase(),
key = a.attr("href").substringAfterLast('/'),
source = source,
)
}
}
private fun getSortKey(sortOrder: SortOrder) =
when (sortOrder) {
SortOrder.ALPHABETICAL -> "name"
SortOrder.POPULARITY -> "rate"
SortOrder.UPDATED -> "updated"
SortOrder.NEWEST -> "created"
SortOrder.RATING -> "votes"
}
override suspend fun getUsername(): String {
val root = webClient.httpGet("https://grouple.co/").parseHtml().body()
val element = root.selectFirst("img.user-avatar") ?: throw AuthRequiredException(source)
val res = element.parent()?.text()
return if (res.isNullOrEmpty()) {
root.parseFailed("Cannot find username")
} else res
}
private suspend fun advancedSearch(domain: String, tags: Set<MangaTag>): Response {
val url = "https://$domain/search/advanced"
// Step 1: map catalog genres names to advanced-search genres ids
val tagsIndex = webClient.httpGet(url).parseHtml()
.body().selectFirst("form.search-form")
?.select("div.form-group")
?.get(1) ?: throw ParseException("Genres filter element not found", url)
val tagNames = tags.map { it.title.lowercase() }
val payload = HashMap<String, String>()
var foundGenres = 0
tagsIndex.select("li.property").forEach { li ->
val name = li.text().trim().lowercase()
val id = li.selectFirst("input")?.id()
?: li.parseFailed("Id for tag $name not found")
payload[id] = if (name in tagNames) {
foundGenres++
"in"
} else ""
}
if (foundGenres != tags.size) {
tagsIndex.parseFailed("Some genres are not found")
}
// Step 2: advanced search
payload["q"] = ""
payload["s_high_rate"] = ""
payload["s_single"] = ""
payload["s_mature"] = ""
payload["s_completed"] = ""
payload["s_translated"] = ""
payload["s_many_chapters"] = ""
payload["s_wait_upload"] = ""
payload["s_sale"] = ""
payload["years"] = "1900,2099"
payload["+"] = "Искать".urlEncoded()
return webClient.httpPost(url, payload)
}
private fun getSortKey(sortOrder: SortOrder) =
when (sortOrder) {
SortOrder.ALPHABETICAL -> "name"
SortOrder.POPULARITY -> "rate"
SortOrder.UPDATED -> "updated"
SortOrder.NEWEST -> "created"
SortOrder.RATING -> "votes"
}
private suspend fun tryHead(url: String): Boolean = runCatchingCancellable {
webClient.httpHead(url).isSuccessful
}.getOrDefault(false)
private suspend fun advancedSearch(domain: String, tags: Set<MangaTag>): Response {
val url = "https://$domain/search/advanced"
// Step 1: map catalog genres names to advanced-search genres ids
val tagsIndex = webClient.httpGet(url).parseHtml()
.body().selectFirst("form.search-form")
?.select("div.form-group")
?.get(1) ?: throw ParseException("Genres filter element not found", url)
val tagNames = tags.map { it.title.lowercase() }
val payload = HashMap<String, String>()
var foundGenres = 0
tagsIndex.select("li.property").forEach { li ->
val name = li.text().trim().lowercase()
val id = li.selectFirst("input")?.id()
?: li.parseFailed("Id for tag $name not found")
payload[id] = if (name in tagNames) {
foundGenres++
"in"
} else ""
}
if (foundGenres != tags.size) {
tagsIndex.parseFailed("Some genres are not found")
}
// Step 2: advanced search
payload["q"] = ""
payload["s_high_rate"] = ""
payload["s_single"] = ""
payload["s_mature"] = ""
payload["s_completed"] = ""
payload["s_translated"] = ""
payload["s_many_chapters"] = ""
payload["s_wait_upload"] = ""
payload["s_sale"] = ""
payload["years"] = "1900,2099"
payload["+"] = "Искать".urlEncoded()
return webClient.httpPost(url, payload)
}
private fun Response.checkAuthRequired(): Response {
val lastPathSegment = request.url.pathSegments.lastOrNull() ?: return this
if (lastPathSegment == "login") {
throw AuthRequiredException(source)
}
return this
}
private suspend fun tryHead(url: String): Boolean = runCatchingCancellable {
webClient.httpHead(url).isSuccessful
}.getOrDefault(false)
private fun Response.checkAuthRequired(): Response {
val lastPathSegment = request.url.pathSegments.lastOrNull() ?: return this
if (lastPathSegment == "login") {
throw AuthRequiredException(source)
}
return this
}
}

@ -0,0 +1,40 @@
package org.koitharu.kotatsu.parsers.util
import kotlinx.coroutines.Deferred
import kotlinx.coroutines.Job
import kotlinx.coroutines.selects.select
import kotlin.coroutines.cancellation.CancellationException
fun Iterable<Job>.cancelAll(cause: CancellationException? = null) {
forEach { it.cancel(cause) }
}
suspend fun <T> Iterable<Deferred<T>>.awaitFirst(): T = select<T> {
for (async in this@awaitFirst) {
async.onAwait { it }
}
}.also { this@awaitFirst.cancelAll() }
suspend fun <T> Collection<Deferred<T>>.awaitFirst(condition: (T) -> Boolean): T {
var result: Any? = NULL
var counter = size
while (result === NULL && counter > 0) {
val candidate = select<T> {
for (async in this@awaitFirst) {
async.onAwait { it }
}
}
if (condition(candidate)) {
result = candidate
}
counter--
}
cancelAll()
if (result === NULL) {
throw NoSuchElementException()
}
@Suppress("UNCHECKED_CAST")
return result as T
}
private val NULL = Any()
Loading…
Cancel
Save