From 9f45f935ac5c57381ee9580d84f40189c2ba8994 Mon Sep 17 00:00:00 2001
From: grizzlyuser <grizzlyuser@protonmail.com>
Date: Sun, 4 Apr 2021 23:16:26 +0300
Subject: libre/iceweasel: JSON processing script updates

Refactor the script and add processing of the top-sites.json Remote
Settings dump. This is needed to override the default sites, which
contain nonfree content and are displayed on the new tab page and in
the address bar dropdown.
---
 libre/iceweasel/PKGBUILD              |   2 +-
 libre/iceweasel/process-json-files.py | 207 ++++++++++++++++++++++------------
 2 files changed, 133 insertions(+), 76 deletions(-)

diff --git a/libre/iceweasel/PKGBUILD b/libre/iceweasel/PKGBUILD
index 191ebdb88..1927046a3 100644
--- a/libre/iceweasel/PKGBUILD
+++ b/libre/iceweasel/PKGBUILD
@@ -93,7 +93,7 @@ sha256sums+=('79c957263eb8eb99cdb5cbadf689883b04d4ad197ab956a1dbd1576bfbdce9ae'
              'SKIP'
              '46171d428125c524c03588f1c5fbb7778e68ef558b5faa54be84f4df5e546427'
              'a6afaac11aa0b8ff6d32470681df54a2acfe8f8b62afe0c19dc168b1424ba098'
-             '6c5b6dda26503a7d4e00b81249225cc5ba03c3bde8b14bb65cb47648e372efee'
+             'c38c5f5937bcedcd5645d63bf5787e0336d0b006b29f64c45414bc3a6e83c3be'
              '714998c5fc379f54a66ff80a845b7880169cd5b4094b77b719a99d33b65c0940')
 validpgpkeys=('14F26682D0916CDD81E37B6D61B7B526D98F0353') # Mozilla Software Releases <release@mozilla.com>
 validpgpkeys+=('BFA8008A8265677063B11BF47171986E4B745536') # Andreas Grapentin
diff --git a/libre/iceweasel/process-json-files.py b/libre/iceweasel/process-json-files.py
index a972e90e2..69264dc94 100644
--- a/libre/iceweasel/process-json-files.py
+++ b/libre/iceweasel/process-json-files.py
@@ -1,6 +1,6 @@
 #! /usr/bin/python3
 
-#    Copyright (C) 2020  grizzlyuser <grizzlyuser@protonmail.com>
+#    Copyright (C) 2020, 2021  grizzlyuser <grizzlyuser@protonmail.com>
 #    Based on: https://gitlab.trisquel.org/trisquel/wrapage-helpers/-/blob/81881d89b2bf7d502dd14fcccdb471fec6f6b206/helpers/DATA/firefox/reprocess-search-config.py
 #    Below is the notice from the original author:
 #
@@ -42,6 +42,7 @@ parser.add_argument(
     '-i',
     '--indent',
     type=int,
+    default=2,
     help='indent for pretty printing of output files')
 arguments = parser.parse_args()
 
@@ -49,103 +50,127 @@ File = namedtuple('File', ['path', 'content'])
 
 
 class RemoteSettings:
-    DUMPS_PATH = arguments.MAIN_PATH / 'services/settings/dumps'
-    JSON_PATHS = tuple(DUMPS_PATH.glob('*/*.json'))
-    WRAPPER_NAME = 'data'
+    DUMPS_PATH_RELATIVE = 'services/settings/dumps'
+    DUMPS_PATH_ABSOLUTE = arguments.MAIN_PATH / DUMPS_PATH_RELATIVE
+
+    _WRAPPER_NAME = 'data'
 
     @classmethod
     def wrap(cls, processed):
-        return File(processed.path, {cls.WRAPPER_NAME: processed.content})
+        return File(processed.path, {cls._WRAPPER_NAME: processed.content})
 
     @classmethod
     def unwrap(cls, parsed_jsons):
-        return [File(json.path, json.content[cls.WRAPPER_NAME])
+        return [File(json.path, json.content[cls._WRAPPER_NAME])
                 for json in parsed_jsons]
 
     @classmethod
-    def process_raw(cls, unwrapped_jsons):
-        changes = []
-        output_path = cls.DUMPS_PATH / 'monitor/changes.json'
+    def should_modify_collection(cls, collection):
+        return True
 
+    @classmethod
+    def process_raw(cls, unwrapped_jsons, parsed_schema):
+        timestamps, result = [], []
         for collection in unwrapped_jsons:
-            if collection.path == cls.DUMPS_PATH / 'main/example.json':
-                continue
-            latest_change = {}
-            latest_change['last_modified'] = max(
-                (record['last_modified'] for record in collection.content), default=0)
-            latest_change['bucket'] = collection.path.parent.name
-            latest_change['collection'] = collection.path.stem
-            changes.append(latest_change)
+            should_modify_collection = cls.should_modify_collection(collection)
+            for record in collection.content:
+                if should_modify_collection:
+                    if cls.should_drop_record(record):
+                        continue
 
-        output_path.parent.mkdir(exist_ok=True)
+                    clone = copy.deepcopy(record)
 
-        return File(output_path, changes)
+                    record = cls.process_record(record)
 
-    @classmethod
-    def process(cls, parsed_jsons):
-        return cls.wrap(cls.process_raw(cls.unwrap(parsed_jsons)))
+                    if clone != record:
+                        timestamp = int(round(time.time_ns() / 10 ** 6))
+                        while timestamp in timestamps:
+                            timestamp += 1
+                        timestamps.append(timestamp)
+                        record['last_modified'] = timestamp
 
+                if parsed_schema is not None:
+                    validate(record, schema=parsed_schema)
 
-class SearchConfig(RemoteSettings):
-    JSON_PATHS = (RemoteSettings.DUMPS_PATH / 'main/search-config.json',)
+                result.append(record)
+
+        cls.OUTPUT_PATH.parent.mkdir(parents=True, exist_ok=True)
 
-    def _get_schema():
-        PATH = arguments.MAIN_PATH / \
-            'toolkit/components/search/schema/search-engine-config-schema.json'
-        with PATH.open() as file:
-            return json.load(file)
+        return File(cls.OUTPUT_PATH, result)
 
     @classmethod
-    def process_raw(cls, unwrapped_jsons):
-        _WHITELIST = ('ddg@search.mozilla.org', 'wikipedia@search.mozilla.org')
-        SCHEMA = cls._get_schema()
+    def process(cls, parsed_jsons, parsed_schema):
+        return cls.wrap(
+            cls.process_raw(
+                cls.unwrap(parsed_jsons),
+                parsed_schema))
 
-        search_engines, timestamps = [], []
-        search_config = unwrapped_jsons[0]
 
-        for search_engine in search_config.content:
-            if search_engine['webExtension']['id'] in _WHITELIST:
-                clone = copy.deepcopy(search_engine)
+class Changes(RemoteSettings):
+    JSON_PATHS = tuple(RemoteSettings.DUMPS_PATH_ABSOLUTE.glob('*/*.json'))
+    OUTPUT_PATH = RemoteSettings.DUMPS_PATH_ABSOLUTE / 'monitor/changes.json'
 
-                if 'telemetryId' in search_engine:
-                    del search_engine['telemetryId']
-                if 'extraParams' in search_engine:
-                    del search_engine['extraParams']
+    @classmethod
+    def process_raw(cls, unwrapped_jsons, parsed_schema):
+        changes = []
+
+        for collection in unwrapped_jsons:
+            if collection.path != RemoteSettings.DUMPS_PATH_ABSOLUTE / 'main/example.json':
+                latest_change = {}
+                latest_change['last_modified'] = max(
+                    (record['last_modified'] for record in collection.content), default=0)
+                latest_change['bucket'] = collection.path.parent.name
+                latest_change['collection'] = collection.path.stem
+                changes.append(latest_change)
+
+        cls.OUTPUT_PATH.parent.mkdir(parents=True, exist_ok=True)
+
+        return File(cls.OUTPUT_PATH, changes)
+
+
+class SearchConfig(RemoteSettings):
+    JSON_PATHS = (
+        RemoteSettings.DUMPS_PATH_ABSOLUTE /
+        'main/search-config.json',
+    )
+    SCHEMA_PATH = arguments.MAIN_PATH / \
+        'toolkit/components/search/schema/search-engine-config-schema.json'
+    OUTPUT_PATH = JSON_PATHS[0]
 
-                general_specifier = {}
-                for specifier in search_engine['appliesTo'].copy():
-                    if 'application' in specifier:
-                        if 'distributions' in specifier['application']:
-                            search_engine['appliesTo'].remove(specifier)
-                            continue
-                        if 'extraParams' in specifier['application']:
-                            del specifier['application']['extraParams']
+    _DUCKDUCKGO_SEARCH_ENGINE_ID = 'ddg@search.mozilla.org'
 
-                    if 'included' in specifier and 'everywhere' in specifier[
-                            'included'] and specifier['included']['everywhere']:
-                        general_specifier = specifier
+    @classmethod
+    def should_drop_record(cls, search_engine):
+        return search_engine['webExtension']['id'] not in (
+            cls._DUCKDUCKGO_SEARCH_ENGINE_ID, 'wikipedia@search.mozilla.org')
 
-                if not general_specifier:
-                    general_specifier = {'included': {'everywhere': True}}
-                    search_engine['appliesTo'].insert(0, general_specifier)
-                if search_engine['webExtension']['id'] == _WHITELIST[0]:
-                    general_specifier['default'] = 'yes'
+    @classmethod
+    def process_record(cls, search_engine):
+        [search_engine.pop(key, None)
+         for key in ['extraParams', 'telemetryId']]
 
-                if clone != search_engine:
-                    timestamp = int(round(time.time_ns() / 10 ** 6))
-                    while timestamp in timestamps:
-                        timestamp += 1
-                    timestamps.append(timestamp)
-                    search_engine['last_modified'] = timestamp
+        general_specifier = {}
+        for specifier in search_engine['appliesTo'].copy():
+            if 'application' in specifier:
+                if 'distributions' in specifier['application']:
+                    search_engine['appliesTo'].remove(specifier)
+                    continue
+                specifier['application'].pop('extraParams', None)
 
-                validate(search_engine, schema=SCHEMA)
+            if 'included' in specifier and 'everywhere' in specifier[
+                    'included'] and specifier['included']['everywhere']:
+                general_specifier = specifier
 
-                search_engines.append(search_engine)
+        if not general_specifier:
+            general_specifier = {'included': {'everywhere': True}}
+            search_engine['appliesTo'].insert(0, general_specifier)
+        if search_engine['webExtension']['id'] == cls._DUCKDUCKGO_SEARCH_ENGINE_ID:
+            general_specifier['default'] = 'yes'
 
-        return File(search_config.path, search_engines)
+        return search_engine
 
 
-class TopSites:
+class TippyTopSites:
     JSON_PATHS = (
         arguments.MAIN_PATH /
         'browser/components/newtab/data/content/tippytop/top_sites.json',
@@ -153,15 +178,42 @@ class TopSites:
         'tippytop/top_sites.json')
 
     @classmethod
-    def process(cls, parsed_jsons):
-        main_top_sites = parsed_jsons[0]
-        branding_top_sites = parsed_jsons[1]
-        result = branding_top_sites.content + \
-            [site for site in main_top_sites.content if 'wikipedia.org' in site['domains']]
-        return File(main_top_sites.path, result)
+    def process(cls, parsed_jsons, parsed_schema):
+        tippy_top_sites_main = parsed_jsons[0]
+        tippy_top_sites_branding = parsed_jsons[1]
+        result = tippy_top_sites_branding.content + \
+            [site for site in tippy_top_sites_main.content if 'wikipedia.org' in site['domains']]
+        return File(tippy_top_sites_main.path, result)
+
 
+class TopSites(RemoteSettings):
+    _TOP_SITES_JSON_PATH = 'main/top-sites.json'
+    _TOP_SITES_PATH_MAIN = RemoteSettings.DUMPS_PATH_ABSOLUTE / _TOP_SITES_JSON_PATH
 
-processors = (SearchConfig, TopSites, RemoteSettings)
+    JSON_PATHS = (
+        arguments.BRANDING_PATH /
+        RemoteSettings.DUMPS_PATH_RELATIVE /
+        _TOP_SITES_JSON_PATH,
+        _TOP_SITES_PATH_MAIN)
+    OUTPUT_PATH = _TOP_SITES_PATH_MAIN
+
+    @classmethod
+    def should_modify_collection(cls, collection):
+        return cls._TOP_SITES_PATH_MAIN == collection.path
+
+    @classmethod
+    def should_drop_record(cls, site):
+        return site['url'] != 'https://www.wikipedia.org/'
+
+    @classmethod
+    def process_record(cls, site):
+        site.pop('exclude_regions', None)
+        return site
+
+
+# To reflect the latest timestamps, Changes class should always come after
+# all other RemoteSettings subclasses
+processors = (TippyTopSites, SearchConfig, TopSites, Changes)
 
 for processor in processors:
     parsed_jsons = []
@@ -169,6 +221,11 @@ for processor in processors:
         with json_path.open() as file:
             parsed_jsons.append(File(json_path, json.load(file)))
 
-    processed = processor.process(parsed_jsons)
+    parsed_schema = None
+    if hasattr(processor, "SCHEMA_PATH"):
+        with processor.SCHEMA_PATH.open() as file:
+            parsed_schema = json.load(file)
+
+    processed = processor.process(parsed_jsons, parsed_schema)
     with processed.path.open('w') as file:
         json.dump(processed.content, file, indent=arguments.indent)
-- 
cgit v1.2.3