4 files changed, 475 insertions, 0 deletions
diff --git a/app/src/main/java/mozilla/components/lib/publicsuffixlist/PublicSuffixList.kt b/app/src/main/java/mozilla/components/lib/publicsuffixlist/PublicSuffixList.kt
new file mode 100644
index 00000000..6df3caca
--- /dev/null
+++ b/app/src/main/java/mozilla/components/lib/publicsuffixlist/PublicSuffixList.kt
@@ -0,0 +1,138 @@
+/*
+ * SPDX-License-Identifier: GPL-3.0-only OR MPL-2.0
+ */
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+package mozilla.components.lib.publicsuffixlist
+
+import android.content.Context
+import kotlinx.coroutines.CoroutineDispatcher
+import kotlinx.coroutines.CoroutineScope
+import kotlinx.coroutines.Deferred
+import kotlinx.coroutines.Dispatchers
+import kotlinx.coroutines.async
+
+/**
+ * API for reading and accessing the public suffix list.
+ *
+ * > A "public suffix" is one under which Internet users can (or historically could) directly register names. Some
+ * > examples of public suffixes are .com, .co.uk and pvt.k12.ma.us. The Public Suffix List is a list of all known
+ * > public suffixes.
+ *
+ * Note that this implementation applies the rules of the public suffix list only and does not validate domains.
+ * The list is loaded lazily on first use; every query runs on the provided scope and returns a [Deferred].
+ *
+ * https://publicsuffix.org/
+ * https://github.com/publicsuffix/list
+ */
+class PublicSuffixList(
+    context: Context,
+    dispatcher: CoroutineDispatcher = Dispatchers.IO,
+    private val scope: CoroutineScope = CoroutineScope(dispatcher)
+) {
+    private val data: PublicSuffixListData by lazy { PublicSuffixListLoader.load(context) }
+
+    /**
+     * Prefetch the public suffix list from disk so that it is available in memory.
+     */
+    fun prefetch(): Deferred<Unit> = scope.async {
+        // Touching the lazy property forces the list to be loaded; the value itself is not needed here.
+        data.run { Unit }
+    }
+
+    /**
+     * Returns true if the given [domain] is a public suffix; false otherwise.
+     *
+     * E.g.:
+     * ```
+     *   co.uk       -> true
+     *   com         -> true
+     *   mozilla.org -> false
+     *   org         -> true
+     * ```
+     *
+     * Note that this method ignores the default "prevailing rule" described in the formal public suffix list algorithm:
+     * If no rule matches then the passed [domain] is assumed to *not* be a public suffix.
+     *
+     * @param [domain] _must_ be a valid domain. [PublicSuffixList] performs no validation, and if any unexpected values
+     * are passed (e.g., a full URL, a domain with a trailing '/', etc) this may return an incorrect result.
+     */
+    fun isPublicSuffix(domain: String): Deferred<Boolean> = scope.async {
+        data.getPublicSuffixOffset(domain) is PublicSuffixOffset.PublicSuffix
+    }
+
+    /**
+     * Returns the public suffix and one more level; known as the registrable domain. Returns `null` if [domain] is a
+     * public suffix itself, a single label that matches no rule, empty, or contains an empty label.
+     *
+     * E.g.:
+     * ```
+     * www.mozilla.org  -> mozilla.org
+     * www.bbc.co.uk    -> bbc.co.uk
+     * a.b.ide.kyoto.jp -> b.ide.kyoto.jp
+     * ```
+     *
+     * @param [domain] _must_ be a valid domain. [PublicSuffixList] performs no validation, and if any unexpected values
+     * are passed (e.g., a full URL, a domain with a trailing '/', etc) this may return an incorrect result.
+     */
+    fun getPublicSuffixPlusOne(domain: String): Deferred<String?> = scope.async {
+        when (val offset = data.getPublicSuffixOffset(domain)) {
+            is PublicSuffixOffset.Offset -> domain
+                .split('.')
+                .drop(offset.value)
+                .joinToString(separator = ".")
+            else -> null
+        }
+    }
+
+    /**
+     * Returns the public suffix of the given [domain]; known as the effective top-level domain (eTLD). Returns `null`
+     * if the [domain] is a public suffix itself, a single label that matches no rule, empty, or contains an empty
+     * label.
+     *
+     * E.g.:
+     * ```
+     * www.mozilla.org  -> org
+     * www.bbc.co.uk    -> co.uk
+     * a.b.ide.kyoto.jp -> ide.kyoto.jp
+     * ```
+     *
+     * @param [domain] _must_ be a valid domain. [PublicSuffixList] performs no validation, and if any unexpected values
+     * are passed (e.g., a full URL, a domain with a trailing '/', etc) this may return an incorrect result.
+     */
+    fun getPublicSuffix(domain: String): Deferred<String?> = scope.async {
+        when (val offset = data.getPublicSuffixOffset(domain)) {
+            is PublicSuffixOffset.Offset -> domain
+                .split('.')
+                .drop(offset.value + 1)
+                .joinToString(separator = ".")
+            else -> null
+        }
+    }
+
+    /**
+     * Strips the public suffix from the given [domain]. Returns the original domain if no public suffix could be
+     * stripped.
+     *
+     * E.g.:
+     * ```
+     * www.mozilla.org  -> www.mozilla
+     * www.bbc.co.uk    -> www.bbc
+     * a.b.ide.kyoto.jp -> a.b
+     * ```
+     *
+     * @param [domain] _must_ be a valid domain. [PublicSuffixList] performs no validation, and if any unexpected values
+     * are passed (e.g., a full URL, a domain with a trailing '/', etc) this may return an incorrect result.
+     */
+    fun stripPublicSuffix(domain: String): Deferred<String> = scope.async {
+        when (val offset = data.getPublicSuffixOffset(domain)) {
+            is PublicSuffixOffset.Offset -> domain
+                .split('.')
+                .joinToString(separator = ".", limit = offset.value + 1, truncated = "")
+                .dropLast(1)
+            else -> domain
+        }
+    }
+}
diff --git a/app/src/main/java/mozilla/components/lib/publicsuffixlist/PublicSuffixListData.kt b/app/src/main/java/mozilla/components/lib/publicsuffixlist/PublicSuffixListData.kt
new file mode 100644
index 00000000..0cbf6945
--- /dev/null
+++ b/app/src/main/java/mozilla/components/lib/publicsuffixlist/PublicSuffixListData.kt
@@ -0,0 +1,161 @@
+/*
+ * SPDX-License-Identifier: GPL-3.0-only OR MPL-2.0
+ */
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+package mozilla.components.lib.publicsuffixlist
+
+import mozilla.components.lib.publicsuffixlist.ext.binarySearch
+import java.net.IDN
+
+/**
+ * Holds the two sections of the public suffix list (rules and exception rules) and resolves which rule applies
+ * to a given domain.
+ *
+ * Lookups are delegated to the `binarySearch` extension imported from the `ext` package.
+ */
+internal class PublicSuffixListData(
+    private val rules: ByteArray,
+    private val exceptions: ByteArray
+) {
+    /** Looks up [labels], starting at [labelIndex], in the rules section. */
+    private fun binarySearchRules(labels: List<ByteArray>, labelIndex: Int): String? =
+        rules.binarySearch(labels, labelIndex)
+
+    /** Looks up [labels], starting at [labelIndex], in the exceptions section. */
+    private fun binarySearchExceptions(labels: List<ByteArray>, labelIndex: Int): String? =
+        exceptions.binarySearch(labels, labelIndex)
+
+    /**
+     * Classifies [domain] against the list.
+     *
+     * The domain is passed through [IDN.toUnicode] and split on '.' before the matching rule is looked up.
+     *
+     * @return `null` when [domain] is empty or contains an empty label; [PublicSuffixOffset.PublicSuffix] or
+     * [PublicSuffixOffset.PrevailingRule] when the matched non-exception rule has as many labels as the domain;
+     * otherwise a [PublicSuffixOffset.Offset]: the domain's label count minus the rule's label count, and minus
+     * one more unless the rule is an exception rule.
+     */
+    @Suppress("ReturnCount")
+    fun getPublicSuffixOffset(domain: String): PublicSuffixOffset? {
+        if (domain.isEmpty()) {
+            return null
+        }
+
+        val domainLabels = IDN.toUnicode(domain).split('.')
+        if (domainLabels.any { it.isEmpty() }) {
+            // At least one of the labels is empty: Bail out.
+            return null
+        }
+
+        val rule = findMatchingRule(domainLabels)
+        val isExceptionRule = rule[0][0] == EXCEPTION_MARKER
+
+        return when {
+            // Exception rules hold the effective TLD plus one.
+            isExceptionRule -> PublicSuffixOffset.Offset(domainLabels.size - rule.size)
+            // The rule accounts for every label: the domain is a public suffix.
+            domainLabels.size == rule.size ->
+                if (rule == PREVAILING_RULE) PublicSuffixOffset.PrevailingRule else PublicSuffixOffset.PublicSuffix
+            // Otherwise the rule is for a public suffix, so we must take one more label.
+            else -> PublicSuffixOffset.Offset(domainLabels.size - (rule.size + 1))
+        }
+    }
+
+    /**
+     * Selects the rule that applies to [domainLabels]: an exception rule wins and is returned with the exception
+     * marker prepended; otherwise the longer of the exact and wildcard matches; the prevailing rule if none matched.
+     *
+     * This algorithm is based on OkHttp's PublicSuffixDatabase class:
+     * https://github.com/square/okhttp/blob/master/okhttp/src/main/java/okhttp3/internal/publicsuffix/PublicSuffixDatabase.java
+     */
+    private fun findMatchingRule(domainLabels: List<String>): List<String> {
+        // Break apart the domain into UTF-8 labels, i.e. foo.bar.com turns into [foo, bar, com].
+        val labelBytes = domainLabels.map { it.toByteArray(Charsets.UTF_8) }
+
+        val exact = findExactMatch(labelBytes)
+        val wildcard = findWildcardMatch(labelBytes)
+
+        findExceptionMatch(labelBytes, wildcard)?.let { exception ->
+            return "$EXCEPTION_MARKER$exception".split('.')
+        }
+
+        if (exact == null && wildcard == null) {
+            return PREVAILING_RULE
+        }
+
+        val exactLabels = exact?.split('.') ?: EMPTY_RULE
+        val wildcardLabels = wildcard?.split('.') ?: EMPTY_RULE
+
+        // A tie goes to the wildcard rule.
+        return if (exactLabels.size > wildcardLabels.size) exactLabels else wildcardLabels
+    }
+
+    /**
+     * Returns an exact match or null.
+     */
+    private fun findExactMatch(labels: List<ByteArray>): String? {
+        // Try every suffix of the labels, longest first. For example, foo.bar.com is looked up as
+        // [foo, bar, com], [bar, com], [com]; the first hit is returned, so the longest matching rule wins.
+        return labels.indices
+            .asSequence()
+            .mapNotNull { index -> binarySearchRules(labels, index) }
+            .firstOrNull()
+    }
+
+    /**
+     * Returns a wildcard match or null.
+     */
+    private fun findWildcardMatch(labels: List<ByteArray>): String? {
+        // In theory, wildcard rules are not restricted to having the wildcard in the leftmost position.
+        // In practice, wildcards are always in the leftmost position. For now, this implementation
+        // cheats and does not attempt every possible permutation. Instead, it only considers wildcards
+        // in the leftmost position. We assert this fact when we generate the public suffix file. If
+        // this assertion ever fails we'll need to refactor this implementation.
+        if (labels.size <= 1) {
+            return null
+        }
+
+        val candidate = labels.toMutableList()
+        for (index in candidate.indices) {
+            candidate[index] = WILDCARD_LABEL
+            binarySearchRules(candidate, index)?.let { return it }
+        }
+
+        return null
+    }
+
+    /**
+     * Returns an exception rule match or null; exceptions are only looked up when [wildcardMatch] is not null.
+     */
+    private fun findExceptionMatch(labels: List<ByteArray>, wildcardMatch: String?): String? {
+        // Exception rules only apply to wildcard rules, so only try it if we matched a wildcard.
+        if (wildcardMatch == null) {
+            return null
+        }
+
+        return labels.indices
+            .asSequence()
+            .mapNotNull { index -> binarySearchExceptions(labels, index) }
+            .firstOrNull()
+    }
+
+    companion object {
+        // The "*" label as bytes; findWildcardMatch substitutes it for leading labels.
+        val WILDCARD_LABEL = byteArrayOf('*'.toByte())
+        // Returned by findMatchingRule when neither an exact nor a wildcard rule matched.
+        val PREVAILING_RULE = listOf("*")
+        // Stand-in for a missing exact or wildcard match when the rule lengths are compared.
+        val EMPTY_RULE = listOf<String>()
+        // First character of the rule that findMatchingRule builds from an exception match.
+        const val EXCEPTION_MARKER = '!'
+    }
+}
+
+internal sealed class PublicSuffixOffset { // Result type of PublicSuffixListData.getPublicSuffixOffset().
+    data class Offset(val value: Int) : PublicSuffixOffset() // Count of leading labels before the suffix plus one.
+    object PublicSuffix : PublicSuffixOffset() // A non-exception list rule has as many labels as the domain.
+    object PrevailingRule : PublicSuffixOffset() // No list rule matched a single-label domain; default "*" applies.
+}
diff --git a/app/src/main/java/mozilla/components/lib/publicsuffixlist/PublicSuffixListLoader.kt b/app/src/main/java/mozilla/components/lib/publicsuffixlist/PublicSuffixListLoader.kt
new file mode 100644
index 00000000..88e82523
--- /dev/null
+++ b/app/src/main/java/mozilla/components/lib/publicsuffixlist/PublicSuffixListLoader.kt
@@ -0,0 +1,51 @@
+/*
+ * SPDX-License-Identifier: GPL-3.0-only OR MPL-2.0
+ */
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+package mozilla.components.lib.publicsuffixlist
+
+import android.content.Context
+import java.io.BufferedInputStream
+import java.io.IOException
+
+private const val PUBLIC_SUFFIX_LIST_FILE = "publicsuffixes"
+
+/**
+ * Loads the list from the app's assets: two sections (rules, then exceptions), each a 4-byte length plus payload.
+ */
+internal object PublicSuffixListLoader {
+    fun load(context: Context): PublicSuffixListData {
+        val input = context.assets.open(PUBLIC_SUFFIX_LIST_FILE).buffered()
+        return input.use { stream ->
+            val rules = stream.readFully(stream.readInt())
+            val exceptions = stream.readFully(stream.readInt())
+            PublicSuffixListData(rules, exceptions)
+        }
+    }
+}
+
+/** Reads a big-endian 32-bit Int. Throws [IOException] if the stream ends early, as readFully does. */
+@Suppress("MagicNumber")
+private fun BufferedInputStream.readInt(): Int = (0 until 4).fold(0) { value, _ ->
+    val next = read() // -1 means the stream ended; masking it would silently yield 0xff.
+    if (next == -1) throw IOException("Unexpected end of stream")
+    (value shl 8) or (next and 0xff)
+}
+
+/**
+ * Reads exactly [size] bytes from this stream and returns them in a new array.
+ *
+ * @throws IOException if the stream ends before [size] bytes have been read.
+ */
+private fun BufferedInputStream.readFully(size: Int): ByteArray = ByteArray(size).also { buffer ->
+    var filled = 0
+    // A single read() may deliver fewer bytes than requested, so keep going until the array is full.
+    while (filled < size) {
+        val count = read(buffer, filled, size - filled)
+        if (count == -1) throw IOException("Unexpected end of stream")
+        filled += count
+    }
+}
diff --git a/app/src/main/java/mozilla/components/lib/publicsuffixlist/ext/ByteArray.kt b/app/src/main/java/mozilla/components/lib/publicsuffixlist/ext/ByteArray.kt
new file mode 100644
index 00000000..5abb9154
--- /dev/null
+++ b/app/src/main/java/mozilla/components/lib/publicsuffixlist/ext/ByteArray.kt
@@ -0,0 +1,125 @@
+/*
+ * SPDX-License-Identifier: GPL-3.0-only OR MPL-2.0
+ */
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+package mozilla.components.lib.publicsuffixlist.ext
+
+import kotlin.experimental.and
+
+private const val BITMASK = 0xff.toByte() // All eight bits set, so `x and BITMASK` leaves a Byte unchanged.
+
+/**
+ * Binary search over this newline-terminated rule data for the rule spelled by [labels] from [labelIndex] onwards
+ * (labels joined with '.'). Returns the matching rule decoded as UTF-8, or null if there is none.
+ * This algorithm is based on OkHttp's PublicSuffixDatabase class:
+ * https://github.com/square/okhttp/blob/master/okhttp/src/main/java/okhttp3/internal/publicsuffix/PublicSuffixDatabase.java
+ */
+@Suppress("ComplexMethod", "NestedBlockDepth")
+internal fun ByteArray.binarySearch(labels: List<ByteArray>, labelIndex: Int): String? {
+    var low = 0
+    var high = size
+    var match: String? = null
+
+    while (low < high) { // NOTE(review): presumably relies on rules sorted by unsigned byte value - confirm.
+        val mid = (low + high) / 2
+        val start = findStartOfLineFromIndex(mid) // First byte after the nearest '\n' at or before mid (0 if none).
+        val end = findEndOfLineFromIndex(start) // Offset from start to the next '\n' after it.
+
+        val publicSuffixLength = start + end - start // Simplifies to end: the byte length of the rule at start.
+
+        var compareResult: Int
+        var currentLabelIndex = labelIndex
+        var currentLabelByteIndex = 0
+        var publicSuffixByteIndex = 0
+
+        var expectDot = false
+        while (true) {
+            val byte0 = if (expectDot) {
+                expectDot = false
+                '.'.toByte()
+            } else {
+                labels[currentLabelIndex][currentLabelByteIndex] and BITMASK
+            }
+
+            val byte1 = this[start + publicSuffixByteIndex] and BITMASK
+
+            // Compare the bytes. Note that the file stores UTF-8 encoded bytes, so we must compare the
+            // unsigned bytes.
+            @Suppress("EXPERIMENTAL_API_USAGE")
+            compareResult = (byte0.toUByte() - byte1.toUByte()).toInt()
+            if (compareResult != 0) {
+                break
+            }
+
+            publicSuffixByteIndex++
+            currentLabelByteIndex++
+
+            if (publicSuffixByteIndex == publicSuffixLength) { // Every byte of the rule has been compared.
+                break
+            }
+
+            if (labels[currentLabelIndex].size == currentLabelByteIndex) {
+                // We've exhausted our current label. Either there are more labels to compare, in which
+                // case we expect a dot as the next character. Otherwise, we've checked all our labels.
+                if (currentLabelIndex == labels.size - 1) {
+                    break
+                } else {
+                    currentLabelIndex++
+                    currentLabelByteIndex = -1 // The increment after the '.' comparison brings this back to 0.
+                    expectDot = true
+                }
+            }
+        }
+
+        if (compareResult < 0) {
+            high = start - 1 // The searched labels sort before this rule: continue in the lower part.
+        } else if (compareResult > 0) {
+            low = start + end + 1 // The searched labels sort after this rule: continue past its '\n'.
+        } else {
+            // We found a match, but are the lengths equal?
+            val publicSuffixBytesLeft = publicSuffixLength - publicSuffixByteIndex
+            var labelBytesLeft = labels[currentLabelIndex].size - currentLabelByteIndex
+            for (i in currentLabelIndex + 1 until labels.size) {
+                labelBytesLeft += labels[i].size // Adds label bytes only; nothing is added for the '.' separators.
+            }
+
+            if (labelBytesLeft < publicSuffixBytesLeft) {
+                high = start - 1
+            } else if (labelBytesLeft > publicSuffixBytesLeft) {
+                low = start + end + 1
+            } else {
+                // Found a match.
+                match = String(this, start, publicSuffixLength, Charsets.UTF_8)
+                break
+            }
+        }
+    }
+
+    return match
+}
+
+/**
+ * Walks backwards from [start] to the nearest '\n' and returns the index right after it, i.e. the start of the
+ * value around [start]. The walk never reads before the beginning of the array: for a non-negative [start] with
+ * no '\n' at or before it the result is 0. When [start] itself holds a '\n', the result is `start + 1`.
+ */
+private fun ByteArray.findStartOfLineFromIndex(start: Int): Int {
+    val newline = '\n'.toByte()
+    // The walk stops on a '\n' or one step before the first element, whichever comes first.
+    val stop = generateSequence(start) { it - 1 }.first { it <= -1 || this[it] == newline }
+    return stop + 1
+}
+
+/**
+ * Returns the distance from [start] to the next '\n' after it, i.e. the length of the value beginning at [start]
+ * (an offset relative to [start], not an absolute index).
+ *
+ * The scan begins at `start + 1`, so the byte at [start] itself is never examined.
+ */
+private fun ByteArray.findEndOfLineFromIndex(start: Int): Int {
+    val newline = '\n'.toByte()
+    return generateSequence(1) { it + 1 }.first { this[start + it] == newline }
+}