Skip to content

Commit

Permalink
Major refactoring
Browse files Browse the repository at this point in the history
  • Loading branch information
spypunk committed Mar 14, 2020
1 parent 3698a44 commit 286a3ba
Show file tree
Hide file tree
Showing 11 changed files with 282 additions and 308 deletions.
5 changes: 2 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -19,15 +19,14 @@ Options:
-t, --mime-type TEXT Mime types to download (example: text/plain)
-e, --file-extension TEXT Extensions to download (example: png)
-d, --depth INT Search depth (default: 1)
-m, --max-uris INT Maximum uris to process (default:
2147483647)
-m, --max-uris INT Maximum uris to visit (default: 1000000)
-s, --include-subdomains Include subdomains
-R, --concurrent-requests INT Concurrent requests (default: 1)
-D, --concurrent-downloads INT Concurrent downloads (default: 1)
-r, --referrer TEXT Referrer (default: https://www.google.com)
-U, --user-agent TEXT User agent (default: Mozilla/5.0 (X11; Linux
x86_64) AppleWebKit/537.36 (KHTML, like
Gecko) Chrome/80.0.3987.122 Safari/537.36)
Gecko) Chrome/80.0.3987.132 Safari/537.36)
-O, --overwrite Overwrite existing files
-v, --version Show the version and exit
-h, --help Show this message and exit
Expand Down
7 changes: 6 additions & 1 deletion pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -205,7 +205,7 @@
<limit>
<counter>BRANCH</counter>
<value>COVEREDRATIO</value>
<minimum>0.7</minimum>
<minimum>0.8</minimum>
</limit>
</limits>
</rule>
Expand Down Expand Up @@ -250,6 +250,11 @@
<artifactId>kotlin-stdlib-jdk8</artifactId>
<version>${kotlin.version}</version>
</dependency>
<dependency>
<groupId>org.jetbrains.kotlin</groupId>
<artifactId>kotlin-stdlib-common</artifactId>
<version>${kotlin.version}</version>
</dependency>
<dependency>
<groupId>org.jetbrains.kotlin</groupId>
<artifactId>kotlin-reflect</artifactId>
Expand Down
129 changes: 51 additions & 78 deletions src/main/kotlin/spypunk/sponge/Sponge.kt
Original file line number Diff line number Diff line change
Expand Up @@ -20,85 +20,69 @@ import org.apache.http.entity.ContentType
import org.jsoup.Jsoup
import org.jsoup.nodes.Document
import org.jsoup.nodes.Element
import java.nio.file.Files
import java.nio.file.Path
import java.util.concurrent.ConcurrentHashMap
import java.util.concurrent.atomic.AtomicInteger

private val htmlMimeTypes = setOf(ContentType.TEXT_HTML.mimeType, ContentType.APPLICATION_XHTML_XML.mimeType)

private fun String.isHtmlMimeType() = htmlMimeTypes.contains(this)
/** Root cause message of this throwable, as returned by [ExceptionUtils.getRootCauseMessage]. */
val Throwable.rootMessage: String
get() = ExceptionUtils.getRootCauseMessage(this)

private fun Element.toSpongeUri(attributeKey: String): SpongeUri? {
return try {
attr(attributeKey)?.toSpongeUri()
} catch (ignored: Exception) {
null
}
}

fun Throwable.rootMessage(): String = ExceptionUtils.getRootCauseMessage(this)

class Sponge(private val spongeService: SpongeService, private val spongeInput: SpongeInput) {
private val requestContext = newFixedThreadPoolContext(spongeInput.concurrentRequests, "request")
private val downloadContext = newFixedThreadPoolContext(spongeInput.concurrentDownloads, "download")
class Sponge(private val spongeService: SpongeService, private val spongeConfig: SpongeConfig) {
private val requestContext = newFixedThreadPoolContext(spongeConfig.concurrentRequests, "request")
private val downloadContext = newFixedThreadPoolContext(spongeConfig.concurrentDownloads, "download")
private val spongeUris = ConcurrentHashMap<String, SpongeUri>()
private val visitedCount = AtomicInteger()

fun execute() = runBlocking {
visit()
}
fun execute() = runBlocking { visit() }

private suspend fun visit(spongeUri: SpongeUri = spongeConfig.spongeUri, parents: Set<SpongeUri> = setOf()) {
if (visitedCount.incrementAndGet() > spongeConfig.maximumUris) return

private suspend fun visit(spongeUri: SpongeUri = spongeInput.spongeUri, parents: Set<SpongeUri> = setOf()) {
try {
downloadOrVisitChildren(
spongeUris.computeIfAbsent(spongeUri.id) { spongeUri },
parents
)
spongeUris.computeIfAbsent(spongeUri.uri) {
initialize(spongeUri)
}.let {
downloadOrVisitChildren(it, parents)
}
} catch (e: Exception) {
System.err.println("⚠ Processing failed for $spongeUri: ${e.rootMessage()}")
System.err.println("⚠ Processing failed for $spongeUri: ${e.rootMessage}")
}
}

private suspend fun downloadOrVisitChildren(spongeUri: SpongeUri, parents: Set<SpongeUri>) {
if (spongeUris.size > spongeInput.maximumUris) return

var download = false

synchronized(spongeUri) {
if (!spongeUri.visited) {
spongeUri.visited = true
private fun initialize(spongeUri: SpongeUri): SpongeUri {
val extension = FilenameUtils.getExtension(spongeUri.path)

if (isDownloadableByExtension(spongeUri)) {
download = true
} else {
val response = spongeService.request(spongeUri)
val mimeType = ContentType.parse(response.contentType()).mimeType
if (spongeConfig.fileExtensions.contains(extension)) {
spongeUri.download = true
} else {
val response = spongeService.request(spongeUri)
val mimeType = ContentType.parse(response.contentType()).mimeType

if (mimeType.isHtmlMimeType()) {
val document = Jsoup.parse(response.body(), response.url().toExternalForm())
if (htmlMimeTypes.contains(mimeType)) {
val document = Jsoup.parse(response.body(), response.url().toExternalForm())

spongeUri.children = getChildren(document, spongeUri)
} else if (spongeInput.mimeTypes.contains(mimeType)) {
download = true
}
}
spongeUri.children = getChildren(document, spongeUri)
} else if (spongeConfig.mimeTypes.contains(mimeType)) {
spongeUri.download = true
}
}

if (download) {
return spongeUri
}

private suspend fun downloadOrVisitChildren(spongeUri: SpongeUri, parents: Set<SpongeUri>) {
if (spongeUri.download) {
download(spongeUri)
} else if (parents.size < spongeInput.maximumDepth && spongeUri.children.isNotEmpty()) {
} else if (spongeUri.children.isNotEmpty() && parents.size < spongeConfig.maximumDepth) {
visit(spongeUri.children, parents + spongeUri)
}
}

private suspend fun download(spongeUri: SpongeUri) {
val path = getDownloadPath(spongeUri)
spongeUri.download = false

if (!spongeInput.overwriteExistingFiles && Files.exists(path)) {
println("$path")
} else {
withContext(downloadContext) { spongeService.download(spongeUri, path) }
}
withContext(downloadContext) { spongeService.download(spongeUri) }
}

private suspend fun visit(spongeUris: Set<SpongeUri>, parents: Set<SpongeUri>) {
Expand All @@ -111,43 +95,32 @@ class Sponge(private val spongeService: SpongeService, private val spongeInput:
val children = getHrefChildren(document) + getImgChildren(document)

return children.distinct()
.filter { isVisitable(it, parent) }
.filter { it != parent && isHostVisitable(it.host) }
.toSet()
}

private fun getHrefChildren(document: Document) = getChildren(document, "a[href]", "abs:href")

private fun getImgChildren(document: Document) = getChildren(document, "img[src]", "abs:src")

private fun getChildren(
document: Document,
cssQuery: String,
attributeKey: String
): Sequence<SpongeUri> {
private fun getChildren(document: Document, cssQuery: String, attributeKey: String): Sequence<SpongeUri> {
return document.select(cssQuery)
.asSequence()
.mapNotNull { it.toSpongeUri(attributeKey) }
.mapNotNull { toSpongeUri(it, attributeKey) }
}

private fun isVisitable(spongeUri: SpongeUri, parent: SpongeUri) =
spongeUri != parent && isHostVisitable(spongeUri.host)

private fun isHostVisitable(host: String): Boolean {
return host == spongeInput.spongeUri.host ||
spongeInput.includeSubdomains && host.endsWith(spongeInput.spongeUri.host)
}

private fun isDownloadableByExtension(spongeUri: SpongeUri): Boolean {
val extension = FilenameUtils.getExtension(spongeUri.path)

return spongeInput.fileExtensions.contains(extension)
private fun toSpongeUri(element: Element, attributeKey: String): SpongeUri? {
return try {
element.attr(attributeKey)?.let {
SpongeUri(it)
}
} catch (ignored: Exception) {
null
}
}

private fun getDownloadPath(spongeUri: SpongeUri): Path {
return spongeInput.outputDirectory
.resolve(spongeUri.host)
.resolve(FilenameUtils.getPath(spongeUri.path))
.resolve(FilenameUtils.getName(spongeUri.path))
.toAbsolutePath()
private fun isHostVisitable(host: String): Boolean {
return host == spongeConfig.spongeUri.host ||
spongeConfig.includeSubdomains && host.endsWith(spongeConfig.spongeUri.host)
}
}
40 changes: 21 additions & 19 deletions src/main/kotlin/spypunk/sponge/SpongeCommand.kt
Original file line number Diff line number Diff line change
Expand Up @@ -29,17 +29,13 @@ import com.natpryce.konfig.stringType
import java.util.regex.Pattern
import kotlin.system.exitProcess

private const val DEFAULT_REFERRER = "https://www.google.com"
private const val DEFAULT_USER_AGENT = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) " +
"Chrome/80.0.3987.122 Safari/537.36"

private val mimeTypePattern = Pattern.compile("^[-\\w.]+/[-\\w.]+\$")
private val version = ConfigurationProperties
.fromResource("sponge.properties")[Key("version", stringType)]

class SpongeCommand : CliktCommand(name = "sponge", printHelpOnEmptyArgs = true) {
private val spongeUri by option("-u", "--uri", help = "URI (example: https://www.google.com)")
.convert { it.toSpongeUri() }
.convert { SpongeUri(it) }
.required()

private val outputDirectory by option("-o", "--output", help = "Output directory where files are downloaded")
Expand All @@ -60,25 +56,25 @@ class SpongeCommand : CliktCommand(name = "sponge", printHelpOnEmptyArgs = true)
private val maximumDepth by option("-d", "--depth", help = "Search depth")
.int()
.restrictTo(1)
.default(1)
.default(DEFAULT_MAXIMUM_DEPTH)

private val maximumUris by option("-m", "--max-uris", help = "Maximum uris to process")
private val maximumUris by option("-m", "--max-uris", help = "Maximum uris to visit")
.int()
.restrictTo(1)
.default(Int.MAX_VALUE)
.default(DEFAULT_MAXIMUM_URIS)

private val includeSubdomains by option("-s", "--include-subdomains", help = "Include subdomains")
.flag()

private val concurrentRequests by option("-R", "--concurrent-requests", help = "Concurrent requests")
.int()
.restrictTo(1)
.default(1)
.default(DEFAULT_CONCURRENT_REQUESTS)

private val concurrentDownloads by option("-D", "--concurrent-downloads", help = "Concurrent downloads")
.int()
.restrictTo(1)
.default(1)
.default(DEFAULT_CONCURRENT_DOWNLOADS)

private val referrer by option("-r", "--referrer", help = "Referrer")
.default(DEFAULT_REFERRER)
Expand All @@ -87,7 +83,7 @@ class SpongeCommand : CliktCommand(name = "sponge", printHelpOnEmptyArgs = true)
.default(DEFAULT_USER_AGENT)

private val overwriteExistingFiles by option("-O", "--overwrite", help = "Overwrite existing files")
.flag(default = false)
.flag(default = DEFAULT_OVERWRITE_EXISTING_FILES)

init {
versionOption(names = setOf("-v", "--version"), version = version) { it }
Expand All @@ -103,23 +99,29 @@ class SpongeCommand : CliktCommand(name = "sponge", printHelpOnEmptyArgs = true)
}

try {
val spongeInput = SpongeInput(
spongeUri,
val spongeServiceConfig = SpongeServiceConfig(
outputDirectory,
referrer,
userAgent,
overwriteExistingFiles
)

val spongeService = SpongeService(spongeServiceConfig)

val spongeConfig = SpongeConfig(
spongeUri,
mimeTypes.toSet(),
fileExtensions.toSet(),
maximumDepth,
maximumUris,
includeSubdomains,
concurrentRequests,
concurrentDownloads,
overwriteExistingFiles)

val spongeService = SpongeService(referrer, userAgent)
concurrentDownloads
)

Sponge(spongeService, spongeInput).execute()
Sponge(spongeService, spongeConfig).execute()
} catch (t: Throwable) {
System.err.println("Unexpected error encountered: : ${t.rootMessage()}")
System.err.println("Unexpected error encountered: : ${t.rootMessage}")

exitProcess(1)
}
Expand Down
26 changes: 26 additions & 0 deletions src/main/kotlin/spypunk/sponge/SpongeConfig.kt
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
/**
* Copyright © 2019-2020 spypunk <[email protected]>
*
* This work is free. You can redistribute it and/or modify it under the
* terms of the Do What The Fuck You Want To Public License, Version 2,
* as published by Sam Hocevar. See the COPYING file for more details.
*/

package spypunk.sponge

/** Default for [SpongeConfig.maximumDepth]; also the default of the `--depth` command-line option. */
const val DEFAULT_MAXIMUM_DEPTH = 1
/** Default for [SpongeConfig.maximumUris]; also the default of the `--max-uris` command-line option. */
const val DEFAULT_MAXIMUM_URIS = 1_000_000
/** Default for [SpongeConfig.includeSubdomains]. */
const val DEFAULT_INCLUDE_SUBDOMAINS = false
/** Default for [SpongeConfig.concurrentRequests]; also the default of `--concurrent-requests`. */
const val DEFAULT_CONCURRENT_REQUESTS = 1
/** Default for [SpongeConfig.concurrentDownloads]; also the default of `--concurrent-downloads`. */
const val DEFAULT_CONCURRENT_DOWNLOADS = 1

/**
 * Crawl settings read by [Sponge].
 *
 * @property spongeUri URI the crawl starts from; its host is the reference used when deciding
 * whether another host may be visited.
 * @property mimeTypes mime types that flag a non-HTML response for download.
 * @property fileExtensions path extensions that flag a URI for download without requesting it first.
 * @property maximumDepth children of a URI are visited only while its number of parents is below
 * this value.
 * @property maximumUris once the visit counter exceeds this value, further visits return immediately.
 * @property includeSubdomains when true, hosts that end with the start URI's host are visitable too.
 * @property concurrentRequests size of the fixed thread pool named "request".
 * @property concurrentDownloads size of the fixed thread pool named "download".
 */
data class SpongeConfig(
val spongeUri: SpongeUri,
val mimeTypes: Set<String>,
val fileExtensions: Set<String>,
val maximumDepth: Int = DEFAULT_MAXIMUM_DEPTH,
val maximumUris: Int = DEFAULT_MAXIMUM_URIS,
val includeSubdomains: Boolean = DEFAULT_INCLUDE_SUBDOMAINS,
val concurrentRequests: Int = DEFAULT_CONCURRENT_REQUESTS,
val concurrentDownloads: Int = DEFAULT_CONCURRENT_DOWNLOADS
)
24 changes: 0 additions & 24 deletions src/main/kotlin/spypunk/sponge/SpongeInput.kt

This file was deleted.

Loading

0 comments on commit 286a3ba

Please sign in to comment.