cockpit/tools/urls-check

#!/usr/bin/python3 -cimport os, sys; os.execv(os.path.dirname(sys.argv[1]) + "/../test/common/pywrap", sys.argv)
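# Note: the shebang above does not run this file directly; it re-executes it
# through ../test/common/pywrap (relative to this script), so that helpers
# such as the "task" module imported below can be found.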
# This file is part of Cockpit.
#
# Copyright (C) 2020 Red Hat, Inc.
#
# Cockpit is free software; you can redistribute it and/or modify it
# under the terms of the GNU Lesser General Public License as published by
# the Free Software Foundation; either version 2.1 of the License, or
# (at your option) any later version.
#
# Cockpit is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with Cockpit; If not, see <http://www.gnu.org/licenses/>.
import argparse
import fnmatch
import os
import subprocess
import sys
import time
import urllib.error
import urllib.parse
from urllib.request import Request, urlopen

import task
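
# BASE_DIR is the repository root (two levels up from this file); DAYS is how
# long the tracking issue must have been untouched before the check runs again.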
BASE_DIR = os.path.realpath(f'{__file__}/../..')
DAYS = 7
TASK_NAME = "Validate all URLs"
USER_AGENT = "Mozilla/5.0 (X11; Fedora; Linux x86_64; rv:90.0) Gecko/20100101 Firefox/90.0"
KNOWN_REDIRECTS = [
    # fnmatch-style patterns; URLs matching these may redirect without being reported
    "https://access.redhat.com/security/updates/classification/#",
    "https://firefox.com/",
    "https://www.microsoft.com/",
    "https://github.com/patternfly/*"
]


def main():
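    """Run the URL check and open, comment on, or close the tracking issue as needed."""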
    parser = argparse.ArgumentParser(description=TASK_NAME)
    parser.add_argument("-v", "--verbose", action="store_true", help="Verbose output")
    parser.add_argument('-n', '--dry-run', action="store_true",
                        help="Check URLs without filing or updating issues")
    opts = parser.parse_args()
    task.verbose = opts.verbose

    if opts.dry_run:
        (success, err) = check_urls(opts.verbose)
        if err:
            print(err)
        sys.exit(1 if err else 0)
    since = time.time() - (DAYS * 86400)

    # When there is an open issue, don't do anything
    issues = task.api.issues(state="open")
    issues = [i for i in issues if i["title"] == TASK_NAME]
    if issues:
        return

    issues = task.api.issues(state="closed", since=since)
    issues = [i for i in issues if i["title"] == TASK_NAME]
    # If the related issue was not modified in the last DAYS days, run the check
    if not issues:
        (success, err) = check_urls(opts.verbose)
        if err:
            # Create a new issue
            data = {
                "title": TASK_NAME,
                "body": err,
                "labels": ["bot"]
            }
            task.api.post("issues", data)
        else:
            # Try to comment on the last issue (look in the last 4*DAYS)
            # If there is no issue to be found, then just open a new one and close it
            success = success or "Task hasn't produced any output"

            since = time.time() - (DAYS * 4 * 86400)
            issues = task.api.issues(state="closed", since=since)
            issues = [i for i in issues if i["title"] == TASK_NAME]
            if issues:
                task.comment(issues[0], success)
            else:
                data = {
                    "title": TASK_NAME,
                    "body": success,
                    "labels": ["bot"]
                }
                new_issue = task.api.post("issues", data)
                task.api.post("issues/{0}".format(new_issue["number"]), {"state": "closed"})


def check_urls(verbose):
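    """Grep pkg/ for URLs and try to fetch every one of them.

    Returns a (success, err) pair of report strings: err is non-empty if any
    URL failed to load, otherwise success summarizes the result, including
    any unexpected redirects.
    """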
    command = r'git grep -IEho "(https?)://[-a-zA-Z0-9@:%_\+.~#?&=/]+" -- pkg ":!*.svg" | sort -u'
    urls = subprocess.check_output(command, shell=True, universal_newlines=True, cwd=BASE_DIR).split("\n")
    redirects = []
    failed = []
    for url in urls:
        if not url:
            continue
        if urllib.parse.urlparse(url).hostname.endswith(".example.com"):  # Some tests use demo urls
            continue
        if verbose:
            print(f"Checking: {url}")
        try:
            # Specify agent as some websites otherwise block requests
            req = Request(url=url, headers={"User-Agent": USER_AGENT})
            resp = urlopen(req)
            if resp.geturl() != url and not any(fnmatch.fnmatch(url, pattern) for pattern in KNOWN_REDIRECTS):
                redirects.append(url)
            if resp.getcode() >= 400:
                failed.append(f"{url} : {resp.getcode()} {resp.reason}")
        except urllib.error.HTTPError as e:
            failed.append(f"{url} : {e.code} {e.reason}")
        except urllib.error.URLError as e:
            # Non-HTTP failures (DNS errors, timeouts, ...) carry no status code
            failed.append(f"{url} : {e.reason}")
    err = ""
    success = ""
    if failed:
        err = f"Checked {len(urls)} URLs, out of which {len(failed)} are invalid:\n"
        err += ''.join(f' {url}\n' for url in failed)
    else:
        success = f"Checked {len(urls)} URLs and all are valid."
    if redirects:
        success += "\nThe following URLs are redirected:\n"
        success += ''.join(f' {url}\n' for url in redirects)
    return success, err

if __name__ == '__main__':
    sys.exit(main())