diff options
Diffstat (limited to 'autofill-parser/src/main/java/mozilla/components')
4 files changed, 479 insertions, 0 deletions
diff --git a/autofill-parser/src/main/java/mozilla/components/lib/publicsuffixlist/PublicSuffixList.kt b/autofill-parser/src/main/java/mozilla/components/lib/publicsuffixlist/PublicSuffixList.kt new file mode 100644 index 00000000..d4d1c1af --- /dev/null +++ b/autofill-parser/src/main/java/mozilla/components/lib/publicsuffixlist/PublicSuffixList.kt @@ -0,0 +1,139 @@ +/* + * SPDX-License-Identifier: (LGPL-3.0-only WITH LGPL-3.0-linking-exception) OR MPL-2.0 + */ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + +package mozilla.components.lib.publicsuffixlist + +import android.content.Context +import kotlinx.coroutines.CoroutineDispatcher +import kotlinx.coroutines.CoroutineScope +import kotlinx.coroutines.Deferred +import kotlinx.coroutines.Dispatchers +import kotlinx.coroutines.async + +/** + * API for reading and accessing the public suffix list. + * + * > A "public suffix" is one under which Internet users can (or historically could) directly register names. Some + * > examples of public suffixes are .com, .co.uk and pvt.k12.ma.us. The Public Suffix List is a list of all known + * > public suffixes. + * + * Note that this implementation applies the rules of the public suffix list only and does not validate domains. + * + * https://publicsuffix.org/ + * https://github.com/publicsuffix/list + */ +class PublicSuffixList( + context: Context, + dispatcher: CoroutineDispatcher = Dispatchers.IO, + private val scope: CoroutineScope = CoroutineScope(dispatcher) +) { + + private val data: PublicSuffixListData by lazy { PublicSuffixListLoader.load(context) } + + /** + * Prefetch the public suffix list from disk so that it is available in memory. + */ + fun prefetch(): Deferred<Unit> = scope.async { + data.run { Unit } + } + + /** + * Returns true if the given [domain] is a public suffix; false otherwise. + * + * E.g.: + * ``` + * co.uk -> true + * com -> true + * mozilla.org -> false + * org -> true + * ``` + * + * Note that this method ignores the default "prevailing rule" described in the formal public suffix list algorithm: + * If no rule matches then the passed [domain] is assumed to *not* be a public suffix. + * + * @param [domain] _must_ be a valid domain. [PublicSuffixList] performs no validation, and if any unexpected values + * are passed (e.g., a full URL, a domain with a trailing '/', etc) this may return an incorrect result. + */ + fun isPublicSuffix(domain: String): Deferred<Boolean> = scope.async { + when (data.getPublicSuffixOffset(domain)) { + is PublicSuffixOffset.PublicSuffix -> true + else -> false + } + } + + /** + * Returns the public suffix and one more level; known as the registrable domain. Returns `null` if + * [domain] is a public suffix itself. + * + * E.g.: + * ``` + * wwww.mozilla.org -> mozilla.org + * www.bcc.co.uk -> bbc.co.uk + * a.b.ide.kyoto.jp -> b.ide.kyoto.jp + * ``` + * + * @param [domain] _must_ be a valid domain. [PublicSuffixList] performs no validation, and if any unexpected values + * are passed (e.g., a full URL, a domain with a trailing '/', etc) this may return an incorrect result. + */ + fun getPublicSuffixPlusOne(domain: String): Deferred<String?> = scope.async { + when (val offset = data.getPublicSuffixOffset(domain)) { + is PublicSuffixOffset.Offset -> domain + .split('.') + .drop(offset.value) + .joinToString(separator = ".") + else -> null + } + } + + /** + * Returns the public suffix of the given [domain]; known as the effective top-level domain (eTLD). Returns `null` + * if the [domain] is a public suffix itself. + * + * E.g.: + * ``` + * wwww.mozilla.org -> org + * www.bcc.co.uk -> co.uk + * a.b.ide.kyoto.jp -> ide.kyoto.jp + * ``` + * + * @param [domain] _must_ be a valid domain. [PublicSuffixList] performs no validation, and if any unexpected values + * are passed (e.g., a full URL, a domain with a trailing '/', etc) this may return an incorrect result. + */ + fun getPublicSuffix(domain: String) = scope.async { + when (val offset = data.getPublicSuffixOffset(domain)) { + is PublicSuffixOffset.Offset -> domain + .split('.') + .drop(offset.value + 1) + .joinToString(separator = ".") + else -> null + } + } + + /** + * Strips the public suffix from the given [domain]. Returns the original domain if no public suffix could be + * stripped. + * + * E.g.: + * ``` + * wwww.mozilla.org -> www.mozilla + * www.bcc.co.uk -> www.bbc + * a.b.ide.kyoto.jp -> a.b + * ``` + * + * @param [domain] _must_ be a valid domain. [PublicSuffixList] performs no validation, and if any unexpected values + * are passed (e.g., a full URL, a domain with a trailing '/', etc) this may return an incorrect result. + */ + fun stripPublicSuffix(domain: String) = scope.async { + when (val offset = data.getPublicSuffixOffset(domain)) { + is PublicSuffixOffset.Offset -> domain + .split('.') + .joinToString(separator = ".", limit = offset.value + 1, truncated = "") + .dropLast(1) + else -> domain + } + } +} diff --git a/autofill-parser/src/main/java/mozilla/components/lib/publicsuffixlist/PublicSuffixListData.kt b/autofill-parser/src/main/java/mozilla/components/lib/publicsuffixlist/PublicSuffixListData.kt new file mode 100644 index 00000000..595d46b1 --- /dev/null +++ b/autofill-parser/src/main/java/mozilla/components/lib/publicsuffixlist/PublicSuffixListData.kt @@ -0,0 +1,163 @@ +/* + * SPDX-License-Identifier: (LGPL-3.0-only WITH LGPL-3.0-linking-exception) OR MPL-2.0 + */ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + +package mozilla.components.lib.publicsuffixlist + +import mozilla.components.lib.publicsuffixlist.ext.binarySearch +import java.net.IDN + +/** + * Class wrapping the public suffix list data and offering methods for accessing rules in it. + */ +internal class PublicSuffixListData( + private val rules: ByteArray, + private val exceptions: ByteArray +) { + + private fun binarySearchRules(labels: List<ByteArray>, labelIndex: Int): String? { + return rules.binarySearch(labels, labelIndex) + } + + private fun binarySearchExceptions(labels: List<ByteArray>, labelIndex: Int): String? { + return exceptions.binarySearch(labels, labelIndex) + } + + @Suppress("ReturnCount") + fun getPublicSuffixOffset(domain: String): PublicSuffixOffset? { + if (domain.isEmpty()) { + return null + } + + val domainLabels = IDN.toUnicode(domain).split('.') + if (domainLabels.find { it.isEmpty() } != null) { + // At least one of the labels is empty: Bail out. + return null + } + + val rule = findMatchingRule(domainLabels) + + if (domainLabels.size == rule.size && rule[0][0] != PublicSuffixListData.EXCEPTION_MARKER) { + // The domain is a public suffix. + return if (rule == PublicSuffixListData.PREVAILING_RULE) { + PublicSuffixOffset.PrevailingRule + } else { + PublicSuffixOffset.PublicSuffix + } + } + + return if (rule[0][0] == PublicSuffixListData.EXCEPTION_MARKER) { + // Exception rules hold the effective TLD plus one. + PublicSuffixOffset.Offset(domainLabels.size - rule.size) + } else { + // Otherwise the rule is for a public suffix, so we must take one more label. + PublicSuffixOffset.Offset(domainLabels.size - (rule.size + 1)) + } + } + + /** + * Find a matching rule for the given domain labels. + * + * This algorithm is based on OkHttp's PublicSuffixDatabase class: + * https://github.com/square/okhttp/blob/master/okhttp/src/main/java/okhttp3/internal/publicsuffix/PublicSuffixDatabase.java + */ + private fun findMatchingRule(domainLabels: List<String>): List<String> { + // Break apart the domain into UTF-8 labels, i.e. foo.bar.com turns into [foo, bar, com]. + val domainLabelsBytes = domainLabels.map { it.toByteArray(Charsets.UTF_8) } + + val exactMatch = findExactMatch(domainLabelsBytes) + val wildcardMatch = findWildcardMatch(domainLabelsBytes) + val exceptionMatch = findExceptionMatch(domainLabelsBytes, wildcardMatch) + + if (exceptionMatch != null) { + return ("${PublicSuffixListData.EXCEPTION_MARKER}$exceptionMatch").split('.') + } + + if (exactMatch == null && wildcardMatch == null) { + return PublicSuffixListData.PREVAILING_RULE + } + + val exactRuleLabels = exactMatch?.split('.') ?: PublicSuffixListData.EMPTY_RULE + val wildcardRuleLabels = wildcardMatch?.split('.') ?: PublicSuffixListData.EMPTY_RULE + + return if (exactRuleLabels.size > wildcardRuleLabels.size) { + exactRuleLabels + } else { + wildcardRuleLabels + } + } + + /** + * Returns an exact match or null. + */ + private fun findExactMatch(labels: List<ByteArray>): String? { + // Start by looking for exact matches. We start at the leftmost label. For example, foo.bar.com + // will look like: [foo, bar, com], [bar, com], [com]. The longest matching rule wins. + + for (i in 0 until labels.size) { + val rule = binarySearchRules(labels, i) + + if (rule != null) { + return rule + } + } + + return null + } + + /** + * Returns a wildcard match or null. + */ + private fun findWildcardMatch(labels: List<ByteArray>): String? { + // In theory, wildcard rules are not restricted to having the wildcard in the leftmost position. + // In practice, wildcards are always in the leftmost position. For now, this implementation + // cheats and does not attempt every possible permutation. Instead, it only considers wildcards + // in the leftmost position. We assert this fact when we generate the public suffix file. If + // this assertion ever fails we'll need to refactor this implementation. + if (labels.size > 1) { + val labelsWithWildcard = labels.toMutableList() + for (labelIndex in 0 until labelsWithWildcard.size) { + labelsWithWildcard[labelIndex] = PublicSuffixListData.WILDCARD_LABEL + val rule = binarySearchRules(labelsWithWildcard, labelIndex) + if (rule != null) { + return rule + } + } + } + + return null + } + + private fun findExceptionMatch(labels: List<ByteArray>, wildcardMatch: String?): String? { + // Exception rules only apply to wildcard rules, so only try it if we matched a wildcard. + if (wildcardMatch == null) { + return null + } + + for (labelIndex in 0 until labels.size) { + val rule = binarySearchExceptions(labels, labelIndex) + if (rule != null) { + return rule + } + } + + return null + } + + companion object { + + val WILDCARD_LABEL = byteArrayOf('*'.toByte()) + val PREVAILING_RULE = listOf("*") + val EMPTY_RULE = listOf<String>() + const val EXCEPTION_MARKER = '!' + } +} + +internal sealed class PublicSuffixOffset { + data class Offset(val value: Int) : PublicSuffixOffset() + object PublicSuffix : PublicSuffixOffset() + object PrevailingRule : PublicSuffixOffset() +} diff --git a/autofill-parser/src/main/java/mozilla/components/lib/publicsuffixlist/PublicSuffixListLoader.kt b/autofill-parser/src/main/java/mozilla/components/lib/publicsuffixlist/PublicSuffixListLoader.kt new file mode 100644 index 00000000..d8542f94 --- /dev/null +++ b/autofill-parser/src/main/java/mozilla/components/lib/publicsuffixlist/PublicSuffixListLoader.kt @@ -0,0 +1,52 @@ +/* + * SPDX-License-Identifier: (LGPL-3.0-only WITH LGPL-3.0-linking-exception) OR MPL-2.0 + */ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + +package mozilla.components.lib.publicsuffixlist + +import android.content.Context +import java.io.BufferedInputStream +import java.io.IOException + +private const val PUBLIC_SUFFIX_LIST_FILE = "publicsuffixes" + +internal object PublicSuffixListLoader { + + fun load(context: Context): PublicSuffixListData = context.assets.open( + PUBLIC_SUFFIX_LIST_FILE + ).buffered().use { stream -> + val publicSuffixSize = stream.readInt() + val publicSuffixBytes = stream.readFully(publicSuffixSize) + + val exceptionSize = stream.readInt() + val exceptionBytes = stream.readFully(exceptionSize) + + PublicSuffixListData(publicSuffixBytes, exceptionBytes) + } +} + +@Suppress("MagicNumber") +private fun BufferedInputStream.readInt(): Int { + return (read() and 0xff shl 24 + or (read() and 0xff shl 16) + or (read() and 0xff shl 8) + or (read() and 0xff)) +} + +private fun BufferedInputStream.readFully(size: Int): ByteArray { + val bytes = ByteArray(size) + + var offset = 0 + while (offset < size) { + val read = read(bytes, offset, size - offset) + if (read == -1) { + throw IOException("Unexpected end of stream") + } + offset += read + } + + return bytes +} diff --git a/autofill-parser/src/main/java/mozilla/components/lib/publicsuffixlist/ext/ByteArray.kt b/autofill-parser/src/main/java/mozilla/components/lib/publicsuffixlist/ext/ByteArray.kt new file mode 100644 index 00000000..fbe8ec5c --- /dev/null +++ b/autofill-parser/src/main/java/mozilla/components/lib/publicsuffixlist/ext/ByteArray.kt @@ -0,0 +1,125 @@ +/* + * SPDX-License-Identifier: (LGPL-3.0-only WITH LGPL-3.0-linking-exception) OR MPL-2.0 + */ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + +package mozilla.components.lib.publicsuffixlist.ext + +import kotlin.experimental.and + +private const val BITMASK = 0xff.toByte() + +/** + * Performs a binary search for the provided [labels] on the [ByteArray]'s data. + * + * This algorithm is based on OkHttp's PublicSuffixDatabase class: + * https://github.com/square/okhttp/blob/1977136/okhttp/src/main/kotlin/okhttp3/internal/publicsuffix/PublicSuffixDatabase.kt + */ +@Suppress("ComplexMethod", "NestedBlockDepth") +internal fun ByteArray.binarySearch(labels: List<ByteArray>, labelIndex: Int): String? { + var low = 0 + var high = size + var match: String? = null + + while (low < high) { + val mid = (low + high) / 2 + val start = findStartOfLineFromIndex(mid) + val end = findEndOfLineFromIndex(start) + + val publicSuffixLength = start + end - start + + var compareResult: Int + var currentLabelIndex = labelIndex + var currentLabelByteIndex = 0 + var publicSuffixByteIndex = 0 + + var expectDot = false + while (true) { + val byte0 = if (expectDot) { + expectDot = false + '.'.toByte() + } else { + labels[currentLabelIndex][currentLabelByteIndex] and BITMASK + } + + val byte1 = this[start + publicSuffixByteIndex] and BITMASK + + // Compare the bytes. Note that the file stores UTF-8 encoded bytes, so we must compare the + // unsigned bytes. + @Suppress("EXPERIMENTAL_API_USAGE") + compareResult = (byte0.toUByte() - byte1.toUByte()).toInt() + if (compareResult != 0) { + break + } + + publicSuffixByteIndex++ + currentLabelByteIndex++ + + if (publicSuffixByteIndex == publicSuffixLength) { + break + } + + if (labels[currentLabelIndex].size == currentLabelByteIndex) { + // We've exhausted our current label. Either there are more labels to compare, in which + // case we expect a dot as the next character. Otherwise, we've checked all our labels. + if (currentLabelIndex == labels.size - 1) { + break + } else { + currentLabelIndex++ + currentLabelByteIndex = -1 + expectDot = true + } + } + } + + if (compareResult < 0) { + high = start - 1 + } else if (compareResult > 0) { + low = start + end + 1 + } else { + // We found a match, but are the lengths equal? + val publicSuffixBytesLeft = publicSuffixLength - publicSuffixByteIndex + var labelBytesLeft = labels[currentLabelIndex].size - currentLabelByteIndex + for (i in currentLabelIndex + 1 until labels.size) { + labelBytesLeft += labels[i].size + } + + if (labelBytesLeft < publicSuffixBytesLeft) { + high = start - 1 + } else if (labelBytesLeft > publicSuffixBytesLeft) { + low = start + end + 1 + } else { + // Found a match. + match = String(this, start, publicSuffixLength, Charsets.UTF_8) + break + } + } + } + + return match +} + +/** + * Search for a '\n' that marks the start of a value. Don't go back past the start of the array. + */ +private fun ByteArray.findStartOfLineFromIndex(start: Int): Int { + var index = start + while (index > -1 && this[index] != '\n'.toByte()) { + index-- + } + index++ + return index +} + +/** + * Search for a '\n' that marks the end of a value. + */ +private fun ByteArray.findEndOfLineFromIndex(start: Int): Int { + var end = 1 + while (this[start + end] != '\n'.toByte()) { + end++ + } + return end +} |