Improve some parsers

master
Koitharu 1 year ago
parent c294f5bb61
commit e874837efb
Signed by: Koitharu
GPG Key ID: 676DEE768C17A9D7

@ -58,19 +58,19 @@ afterEvaluate {
}
dependencies {
implementation 'org.jetbrains.kotlinx:kotlinx-coroutines-core:1.10.1'
implementation 'org.jetbrains.kotlinx:kotlinx-coroutines-core:1.10.2'
implementation 'com.squareup.okhttp3:okhttp:4.12.0'
implementation 'com.squareup.okio:okio:3.10.2'
api 'org.jsoup:jsoup:1.18.3'
implementation 'com.squareup.okio:okio:3.11.0'
api 'org.jsoup:jsoup:1.19.1'
implementation 'org.json:json:20240303'
implementation 'androidx.collection:collection:1.4.5'
implementation 'androidx.collection:collection:1.5.0'
ksp project(':kotatsu-parsers-ksp')
testImplementation 'org.junit.jupiter:junit-jupiter-api:5.10.1'
testImplementation 'org.junit.jupiter:junit-jupiter-engine:5.10.1'
testImplementation 'org.junit.jupiter:junit-jupiter-params:5.10.1'
testImplementation 'org.jetbrains.kotlinx:kotlinx-coroutines-test:1.10.1'
testImplementation 'org.jetbrains.kotlinx:kotlinx-coroutines-test:1.10.2'
testImplementation 'io.webfolder:quickjs:1.1.0'
}

@ -38,6 +38,11 @@ public abstract class LegacyPagedMangaParser(
public abstract suspend fun getListPage(page: Int, order: SortOrder, filter: MangaListFilter): List<Manga>
protected fun setFirstPage(firstPage: Int, firstPageForSearch: Int = firstPage) {
paginator.firstPage = firstPage
searchPaginator.firstPage = firstPageForSearch
}
private suspend fun getList(
paginator: Paginator,
offset: Int,

@ -43,6 +43,11 @@ public abstract class PagedMangaParser(
public abstract suspend fun getListPage(query: MangaSearchQuery, page: Int): List<Manga>
protected fun setFirstPage(firstPage: Int, firstPageForSearch: Int = firstPage) {
paginator.firstPage = firstPage
searchPaginator.firstPage = firstPageForSearch
}
private suspend fun searchManga(
paginator: Paginator,
query: MangaSearchQuery,

@ -47,7 +47,7 @@ public data class MangaChapter(
get() = title.ifNullOrEmpty {
buildString {
if (volume > 0) append("Vol ").append(volume).append(' ')
if (number > 0) append("Chapter ").append(number) else append("Unnamed")
if (number > 0) append("Chapter ").append(number.formatSimple()) else append("Unnamed")
}
}

@ -1,7 +1,7 @@
package org.koitharu.kotatsu.parsers.site.en
import org.jsoup.nodes.Document
import org.jsoup.nodes.Element
import org.json.JSONObject
import org.koitharu.kotatsu.parsers.Broken
import org.koitharu.kotatsu.parsers.MangaLoaderContext
import org.koitharu.kotatsu.parsers.MangaSourceParser
import org.koitharu.kotatsu.parsers.config.ConfigKey
@ -9,14 +9,13 @@ import org.koitharu.kotatsu.parsers.core.LegacyPagedMangaParser
import org.koitharu.kotatsu.parsers.exception.ParseException
import org.koitharu.kotatsu.parsers.model.*
import org.koitharu.kotatsu.parsers.util.*
import org.koitharu.kotatsu.parsers.util.json.getFloatOrDefault
import org.koitharu.kotatsu.parsers.util.json.getStringOrNull
import org.koitharu.kotatsu.parsers.util.suspendlazy.getOrNull
import org.koitharu.kotatsu.parsers.util.suspendlazy.suspendLazy
import java.text.SimpleDateFormat
import java.util.*
import org.json.JSONObject
import org.koitharu.kotatsu.parsers.Broken
@Broken("Need fix tags in getDetails")
@MangaSourceParser("BATCAVE", "BatCave", "en")
internal class BatCave(context: MangaLoaderContext) :
LegacyPagedMangaParser(context, MangaParserSource.BATCAVE, 20) {
@ -36,12 +35,11 @@ internal class BatCave(context: MangaLoaderContext) :
get() = MangaListFilterCapabilities(
isSearchSupported = true,
isMultipleTagsSupported = true,
isSearchWithFiltersSupported = false,
isYearRangeSupported = true
isYearRangeSupported = true,
)
override suspend fun getFilterOptions() = MangaListFilterOptions(
availableTags = availableTags.get()
availableTags = availableTags.get(),
)
override suspend fun getListPage(page: Int, order: SortOrder, filter: MangaListFilter): List<Manga> {
@ -52,6 +50,7 @@ internal class BatCave(context: MangaLoaderContext) :
urlBuilder.append(filter.query.urlEncoded())
if (page > 1) urlBuilder.append("/page/$page/")
}
else -> {
urlBuilder.append("/ComicList")
if (filter.yearFrom != YEAR_UNKNOWN) {
@ -65,20 +64,22 @@ internal class BatCave(context: MangaLoaderContext) :
urlBuilder.append(filter.tags.joinToString(",") { it.key })
}
urlBuilder.append("/sort")
if (page > 1) { urlBuilder.append("/page/$page/") }
if (page > 1) {
urlBuilder.append("/page/$page/")
}
}
}
val fullUrl = urlBuilder.toString().toAbsoluteUrl(domain)
val doc = webClient.httpGet(fullUrl).parseHtml()
return doc.select("div.readed.d-flex.short").map { item ->
val a = item.selectFirst("a.readed__img.img-fit-cover.anim")
?: throw ParseException("Link element not found!", fullUrl)
val a = item.selectFirstOrThrow("a.readed__img.img-fit-cover.anim")
val titleElement = item.selectFirstOrThrow("h2.readed__title a")
val img = item.selectFirst("img[data-src]")
val titleElement = item.selectFirst("h2.readed__title a")
val href = a.attrAsRelativeUrl("href")
Manga(
id = generateUid(a.attr("href")),
url = a.attr("href"),
id = generateUid(href),
url = href,
publicUrl = a.attr("href"),
title = titleElement.text(),
altTitles = emptySet(),
@ -87,7 +88,7 @@ internal class BatCave(context: MangaLoaderContext) :
tags = emptySet(),
rating = RATING_UNKNOWN,
state = null,
coverUrl = img.attr("data-src")?.toAbsoluteUrl(domain),
coverUrl = img?.attrAsAbsoluteUrlOrNull("data-src"),
contentRating = if (isNsfwSource) ContentRating.ADULT else null,
source = source,
)
@ -102,24 +103,22 @@ internal class BatCave(context: MangaLoaderContext) :
val scriptData = doc.selectFirst("script:containsData(__DATA__)")?.data()
?.substringAfter("window.__DATA__ = ")
?.substringBefore(";")
?: throw ParseException("Script data not found", manga.url)
?: doc.parseFailed("Script data not found")
val jsonData = JSONObject(scriptData)
val newsId = jsonData.getInt("news_id")
val newsId = jsonData.getLong("news_id")
val chaptersJson = jsonData.getJSONArray("chapters")
val chapters = (0 until chaptersJson.length()).map { i ->
val chapters = List(chaptersJson.length()) { i ->
val chapter = chaptersJson.getJSONObject(i)
val chapterId = chapter.getInt("id")
val chapterId = chapter.getLong("id")
MangaChapter(
id = generateUid("/reader/$newsId/$chapterId"),
id = generateUid("$newsId/$chapterId"),
url = "/reader/$newsId/$chapterId",
number = chapter.getInt("posi").toFloat(),
title = chapter.getString("title"),
uploadDate = runCatching {
dateFormat.parse(chapter.getString("date"))?.time
}.getOrNull() ?: 0L,
number = chapter.getFloatOrDefault("posi", 0f),
title = chapter.getStringOrNull("title"),
uploadDate = dateFormat.tryParse(chapter.getStringOrNull("date")),
source = source,
scanlator = null,
branch = null,
@ -127,24 +126,36 @@ internal class BatCave(context: MangaLoaderContext) :
)
}
val author = doc.selectFirst("li:contains(Publisher:)")?.text()?.substringAfter("Publisher:")?.trim()
val state = when (doc.selectFirst("li:contains(Release type:)")?.text()?.substringAfter("Release type:")?.trim()) {
val author = doc.selectFirst("li:contains(Publisher:)")
?.textOrNull()
?.substringAfter("Publisher:")
?.trim()
?.nullIfEmpty()
val state = when (
doc.selectFirst("li:contains(Release type:)")?.text()?.substringAfter("Release type:")?.trim()
) {
"Ongoing" -> MangaState.ONGOING
else -> MangaState.FINISHED
}
val allTags = availableTags.get()
val tags = doc.select("div.page__tags.d-flex a").mapNotNullToSet { a ->
val tagLinks = doc.getElementsByAttributeValueContaining("href", "/genres/")
val tags = if (tagLinks.isNotEmpty()) {
availableTags.getOrNull()?.let { allTags ->
tagLinks.mapNotNullToSet { a ->
val tagName = a.text()
allTags.find { it.title.equals(tagName, ignoreCase = true) }
}
}
} else {
null
}
return manga.copy(
authors = setOfNotNull(author),
state = state,
chapters = chapters,
description = doc.select("div.page__text.full-text.clearfix").text(),
tags = tags
description = doc.select("div.page__text.full-text.clearfix").textOrNull(),
tags = tags ?: manga.tags,
)
}
@ -165,15 +176,14 @@ internal class BatCave(context: MangaLoaderContext) :
id = generateUid(imageUrl),
url = imageUrl,
preview = null,
source = source
source = source,
)
}
}
private suspend fun fetchTags(): Set<MangaTag> {
val doc = webClient.httpGet("https://$domain/comix/").parseHtml()
val scriptData = doc.selectFirst("script:containsData(__XFILTER__)")?.data()
?: throw ParseException("Script data not found", "$domain/genres")
val scriptData = doc.selectFirstOrThrow("script:containsData(__XFILTER__)").data()
val genresJson = scriptData
.substringAfter("\"g\":{")
@ -182,13 +192,13 @@ internal class BatCave(context: MangaLoaderContext) :
val genresObj = JSONObject("{$genresJson}")
val valuesArray = genresObj.getJSONArray("values")
return (0 until valuesArray.length()).map { i ->
return List(valuesArray.length()) { i ->
val genre = valuesArray.getJSONObject(i)
MangaTag(
key = genre.getInt("id").toString(),
title = genre.getString("value"),
source = source
title = genre.getString("value").toTitleCase(sourceLocale),
source = source,
)
}.toSet()
}
}
}

@ -1,17 +1,18 @@
package org.koitharu.kotatsu.parsers.site.en
import org.json.JSONArray
import androidx.collection.ArraySet
import androidx.collection.MutableIntList
import androidx.collection.MutableIntObjectMap
import org.json.JSONObject
import org.jsoup.HttpStatusException
import org.koitharu.kotatsu.parsers.MangaLoaderContext
import org.koitharu.kotatsu.parsers.MangaSourceParser
import org.koitharu.kotatsu.parsers.model.*
import org.koitharu.kotatsu.parsers.config.ConfigKey
import org.koitharu.kotatsu.parsers.core.LegacyPagedMangaParser
import org.koitharu.kotatsu.parsers.exception.ParseException
import org.koitharu.kotatsu.parsers.config.ConfigKey
import org.koitharu.kotatsu.parsers.model.*
import org.koitharu.kotatsu.parsers.network.UserAgents
import org.koitharu.kotatsu.parsers.util.*
import org.koitharu.kotatsu.parsers.util.json.*
import java.net.HttpURLConnection
import java.text.SimpleDateFormat
import java.util.*
@ -53,12 +54,10 @@ internal class Hentalk(context: MangaLoaderContext) :
isSearchSupported = true,
isMultipleTagsSupported = true,
isSearchWithFiltersSupported = true,
isAuthorSearchSupported = true
isAuthorSearchSupported = true,
)
override suspend fun getFilterOptions(): MangaListFilterOptions {
return MangaListFilterOptions( availableTags = emptySet() ) // not found any URLs for it
}
override suspend fun getFilterOptions() = MangaListFilterOptions() // not found any URLs for it
override suspend fun getListPage(page: Int, order: SortOrder, filter: MangaListFilter): List<Manga> {
val url = buildString {
@ -72,20 +71,20 @@ internal class Hentalk(context: MangaLoaderContext) :
if (!filter.author.isNullOrEmpty()) {
append("artist:\"${space2plus(filter.author)}\"")
append("+")
append('+')
}
if (filter.tags.isNotEmpty()) {
filter.tags.forEach { tag ->
append("tag:\"${space2plus(tag.key)}\"")
append("+")
append('+')
}
}
if (!filter.query.isNullOrEmpty()) {
append(space2plus(filter.query))
} else {
append("+")
append('+')
}
}
}
@ -115,28 +114,28 @@ internal class Hentalk(context: MangaLoaderContext) :
}
}
val mangaList = mutableListOf<Manga>()
val dataValues = mutableMapOf<Int, Any>()
val dataArray = json.getJSONArray("nodes")
.optJSONObject(2)
?.optJSONArray("data")
?: return emptyList()
val dataValues = MutableIntObjectMap<Any>(dataArray.length())
for (i in 0 until dataArray.length()) {
dataValues[i] = dataArray.get(i)
}
val archiveH = mutableListOf<Int>()
val archiveH = MutableIntList(dataArray.length())
for (i in 0 until dataArray.length()) {
val item = dataArray.opt(i)
if (item is JSONObject && item.has("id") && item.has("hash") &&
item.has("title") && item.has("thumbnail") && item.has("tags")) {
item.has("title") && item.has("thumbnail") && item.has("tags")
) {
archiveH.add(i)
}
}
for (tempIndex in archiveH) {
val mangaList = ArrayList<Manga>()
archiveH.forEach { tempIndex ->
val temp = dataArray.getJSONObject(tempIndex)
val idRef = temp.getInt("id")
val hashRef = temp.getInt("hash")
@ -151,7 +150,7 @@ internal class Hentalk(context: MangaLoaderContext) :
val idThumbnail = dataArray.getInt(thumbnailRef)
val tagsList = dataArray.optJSONArray(tagsRef)
val tags = mutableSetOf<MangaTag>()
val tags = ArraySet<MangaTag>()
var author: String? = null
if (tagsList != null) {
@ -161,30 +160,35 @@ internal class Hentalk(context: MangaLoaderContext) :
if (dataValues.containsKey(tagRefIndex) &&
dataValues[tagRefIndex] is JSONObject &&
(dataValues[tagRefIndex] as JSONObject).has("namespace")) {
(dataValues[tagRefIndex] as JSONObject).has("namespace")
) {
val nsObj = dataValues[tagRefIndex] as JSONObject
val nsIndex = nsObj.getInt("namespace")
val nameIndex = nsObj.getInt("name")
val nsValue = if (dataValues.containsKey(nsIndex)) dataValues[nsIndex].toString() else ""
val nameValue = if (dataValues.containsKey(nameIndex)) dataValues[nameIndex].toString() else ""
val nsValue = if (dataValues.containsKey(nsIndex)) dataValues[nsIndex].toString() else null
val nameValue =
if (dataValues.containsKey(nameIndex)) dataValues[nameIndex].toString() else null
if (nsValue == "artist") {
author = nameValue
} else if (nsValue == "tag") {
tags.add(MangaTag(
author = nameValue?.nullIfEmpty()
} else if (nsValue == "tag" && nameValue != null) {
tags.add(
MangaTag(
key = nameValue,
title = nameValue,
source = source
))
source = source,
),
)
}
}
i++
}
}
mangaList.add(Manga(
mangaList.add(
Manga(
id = generateUid(mangaId),
url = "/g/$mangaId/__data.json?x-sveltekit-invalidated=001",
publicUrl = "https://$domain/g/$mangaId",
@ -199,7 +203,8 @@ internal class Hentalk(context: MangaLoaderContext) :
contentRating = ContentRating.ADULT,
source = source,
rating = RATING_UNKNOWN,
))
),
)
}
return mangaList
@ -207,12 +212,12 @@ internal class Hentalk(context: MangaLoaderContext) :
override suspend fun getDetails(manga: Manga): Manga {
val json = webClient.httpGet(manga.url.toAbsoluteUrl(domain)).parseJson()
val mangaId = manga.url.substringAfter("/g/").substringBefore("/")
val mangaId = manga.url.substringAfter("/g/").substringBefore('/')
val dataArray = json.getJSONArray("nodes")
.optJSONObject(2)
?.optJSONArray("data")
?: return manga.copy()
?: return manga
var createdAt = ""
@ -242,7 +247,7 @@ internal class Hentalk(context: MangaLoaderContext) :
)
return manga.copy(
chapters = listOf(chapter)
chapters = listOf(chapter),
)
}
@ -292,7 +297,7 @@ internal class Hentalk(context: MangaLoaderContext) :
}
}
val imgList = mutableListOf<String>()
val imgList = ArrayList<String>(dataArray.length())
for (i in 0 until dataArray.length()) {
val item = dataArray.opt(i)
if (item is JSONObject && item.has("filename")) {
@ -324,8 +329,5 @@ internal class Hentalk(context: MangaLoaderContext) :
}
}
private fun space2plus(input: String): String {
return input.replace(" ", "+")
}
private fun space2plus(input: String): String = input.replace(' ', '+')
}

@ -1,18 +1,13 @@
package org.koitharu.kotatsu.parsers.site.vi
import org.json.JSONArray
import org.json.JSONObject
import kotlinx.coroutines.async
import kotlinx.coroutines.coroutineScope
import org.koitharu.kotatsu.parsers.MangaLoaderContext
import org.koitharu.kotatsu.parsers.MangaSourceParser
import org.koitharu.kotatsu.parsers.config.ConfigKey
import org.koitharu.kotatsu.parsers.core.LegacyPagedMangaParser
import org.koitharu.kotatsu.parsers.model.*
import org.koitharu.kotatsu.parsers.util.suspendlazy.suspendLazy
import org.koitharu.kotatsu.parsers.util.*
import org.koitharu.kotatsu.parsers.util.json.*
import java.text.SimpleDateFormat
import java.util.*
@MangaSourceParser("MIMIHENTAI", "MimiHentai", "vi", type = ContentType.HENTAI)
@ -24,21 +19,26 @@ internal class MimiHentai(context: MangaLoaderContext) :
override val availableSortOrders: Set<SortOrder> = EnumSet.of(SortOrder.UPDATED)
override suspend fun getFilterOptions() = MangaListFilterOptions(availableTags = fetchTags())
override val filterCapabilities: MangaListFilterCapabilities
get() = MangaListFilterCapabilities(
isSearchSupported = true,
isSearchWithFiltersSupported = true,
isMultipleTagsSupported = true,
isAuthorSearchSupported = true
isAuthorSearchSupported = true,
)
init {
setFirstPage(0)
}
override suspend fun getFilterOptions() = MangaListFilterOptions(availableTags = fetchTags())
override suspend fun getListPage(page: Int, order: SortOrder, filter: MangaListFilter): List<Manga> {
val url = buildString {
append("https://")
append(domain)
append("/$apiSuffix/advance-search?page=")
append(page - 1) // first page is 0, not 1
append(page)
append("&max=18") // page size, avoid rate limit
when {
!filter.query.isNullOrEmpty() -> {
@ -63,13 +63,13 @@ internal class MimiHentai(context: MangaLoaderContext) :
return parseMangaList(data)
}
private suspend fun parseMangaList(data: JSONArray): List<Manga> {
private fun parseMangaList(data: JSONArray): List<Manga> {
return data.mapJSON { jo ->
val id = jo.getLong("id")
val title = jo.getString("title")
val description = jo.getString("description")
val authors = jo.getJSONArray("authors").asTypedList<String>().mapToSet { it }
val differentNames = jo.getJSONArray("differentNames").asTypedList<String>().mapToSet { it }
val description = jo.getStringOrNull("description")
val authors = jo.getJSONArray("authors").asTypedList<String>().toSet()
val differentNames = jo.getJSONArray("differentNames").asTypedList<String>().toSet()
val state = when (description) {
"Đang Tiến Hành" -> MangaState.ONGOING
"Hoàn Thành" -> MangaState.FINISHED
@ -93,48 +93,48 @@ internal class MimiHentai(context: MangaLoaderContext) :
}
}
override suspend fun getDetails(manga: Manga): Manga = coroutineScope {
val url = "https://" + domain + manga.url
override suspend fun getDetails(manga: Manga): Manga {
val url = manga.url.toAbsoluteUrl(domain)
val json = webClient.httpGet(url).parseJson()
val relationInfo = json.getJSONObject("relationInfo")
val tags = relationInfo.getJSONArray("genres").mapJSON { jo ->
val tags = relationInfo.getJSONArray("genres").mapJSONToSet { jo ->
MangaTag(
title = jo.getString("name"),
title = jo.getString("name").toTitleCase(sourceLocale),
key = jo.getLong("id").toString(),
source = source,
)
}.toSet()
}
val basicInfo = json.getJSONObject("basicInfo")
val id = basicInfo.getLong("id")
val description = basicInfo.optString("fdescription").takeUnless { it.isNullOrEmpty() }
val uploaderName = json.getString("uploaderName")
val description = basicInfo.getStringOrNull("fdescription")
val uploaderName = json.getStringOrNull("uploaderName")
val urlChaps = "https://$domain/$apiSuffix/gallery/$id"
val parseUrlChaps = async { JSONArray(webClient.httpGet(urlChaps).parseHtml().text()) }
val chapters = parseUrlChaps.await().mapJSON { jo ->
val parsedChapters = webClient.httpGet(urlChaps).parseJsonArray()
val chapters = parsedChapters.mapJSON { jo ->
MangaChapter(
id = generateUid(jo.getLong("id")),
title = jo.getString("title"),
number = jo.getInt("number").toFloat(),
title = jo.getStringOrNull("title"),
number = jo.getFloatOrDefault("number", 0f),
url = "/$apiSuffix/chapter?id=${jo.getLong("id")}",
uploadDate = 0L,
source = source,
scanlator = uploaderName,
branch = null,
volume = 0
volume = 0,
)
}
manga.copy(
return manga.copy(
tags = tags,
description = description,
chapters = chapters
chapters = chapters,
)
}
override suspend fun getPages(chapter: MangaChapter): List<MangaPage> {
val json = webClient.httpGet("https://$domain${chapter.url}").parseJson()
val json = webClient.httpGet(chapter.url.toAbsoluteUrl(domain)).parseJson()
val imageUrls = json.getJSONArray("pages").asTypedList<String>()
return imageUrls.map { url ->
MangaPage(
@ -148,13 +148,13 @@ internal class MimiHentai(context: MangaLoaderContext) :
private suspend fun fetchTags(): Set<MangaTag> {
val url = "https://$domain/$apiSuffix/genres"
val response = JSONArray(webClient.httpGet(url).parseHtml().text())
return response.mapJSON { jo ->
val response = webClient.httpGet(url).parseJsonArray()
return response.mapJSONToSet { jo ->
MangaTag(
title = jo.getString("name"),
title = jo.getString("name").toTitleCase(sourceLocale),
key = jo.getLong("id").toString(),
source = source,
)
}.toSet()
}
}
}

@ -154,7 +154,7 @@ internal class MangaParserTest {
val parser = context.newParserInstance(source)
val list = parser.getList(MangaSearchQuery.EMPTY)
val manga = list[0]
val manga = list.random()
parser.getDetails(manga).apply {
assert(!chapters.isNullOrEmpty()) { "Chapters are null or empty" }
assert(publicUrl.isUrlAbsolute()) { "Manga public url is not absolute: '$publicUrl'" }

Loading…
Cancel
Save