cockpit/test/verify/check-metrics

#!/usr/bin/python3 -cimport os, sys; os.execv(os.path.dirname(sys.argv[1]) + "/../common/pywrap", sys.argv)
# Run this with --help to see available options for tracing and debugging
# See https://github.com/cockpit-project/cockpit/blob/main/test/common/testlib.py
# "class Browser" and "class MachineCase" for the available API.

import re
import time

# import Cockpit's machinery for test VMs and its browser test API
import packagelib
from testlib import (
    Browser,
    Error,
    MachineCase,
    nondestructive,
    skipDistroPackage,
    skipImage,
    skipMobile,
    skipOstree,
    test_main,
    wait,
)

from lib.constants import TEST_OS_DEFAULT
from machine_core import ssh_connection


def getMaximumSpike(test, g_type, saturation, hour, minute):
    """Return the largest point of a minute's graph, as a fraction of 100%.

    Only usable for minutes with events, as only those have SVG graphs.

    :param test: test case object; its ``browser`` and ``assertNotIn`` are used
    :param g_type: suffix of the ``metrics-data-<g_type>`` CSS class to look at
    :param saturation: if true, read the second child ``div``, otherwise the first one
    :param hour: suffix of the ``metrics-hour-<hour>`` element ID
    :param minute: value of the ``data-minute`` attribute of the minute row
    :return: maximum of the first coordinate of all points, divided by 100
    """
    # only for minutes with events, which have SVG graphs
    sel = f"#metrics-hour-{hour} div.metrics-minute[data-minute={minute}] div.metrics-data-{g_type} div"
    sel += ":nth-child(2)" if saturation else ":first-child"

    points = test.browser.attr(sel, "points")
    # Validate the raw attribute text: float("NaN") parses fine, and the string "NaN"
    # can never be a member of a list of floats, so checking the parsed list is a no-op
    test.assertNotIn("NaN", points)
    # each point looks like "<x>% <y>"; a trailing separator yields an empty item which is skipped
    xs = [float(x.split(" ")[0].rstrip("%")) for x in points.split(", ") if x != ""]

    return max(xs) / 100


def getCompressedMinuteValue(test, g_type, saturation, hour, minute):
    """Return the value of a compressed minute bar, read from its inline style.

    Only usable for minutes without events, which only have bars.

    :param test: test case object; its ``browser`` and ``assertIsNotNone`` are used
    :param g_type: suffix of the ``metrics-data-<g_type>`` CSS class to look at
    :param saturation: if true, read the ``.polygon-sat`` element and its ``--saturation``
        property, otherwise ``.polygon-use`` and ``--utilization``
    :param hour: suffix of the ``metrics-hour-<hour>`` element ID
    :param minute: value of the ``data-minute`` attribute of the minute row
    :return: the numeric value of the CSS custom property as float
    """
    if saturation:
        polygon_class, css_property = ".polygon-sat", "saturation"
    else:
        polygon_class, css_property = ".polygon-use", "utilization"

    sel = (f"#metrics-hour-{hour} div.metrics-minute[data-minute={minute}] "
           f"div.metrics-data-{g_type} .compressed{polygon_class}")
    style = test.browser.attr(sel, "style")
    # the value is carried in a CSS custom property, like "--utilization: 0.5;"
    found = re.search(rf"--{css_property}:\s*([0-9.]+);", style)
    test.assertIsNotNone(found)
    return float(found.group(1))


def progressValue(test, progress_bar_sel):
    """Return the integer percentage shown by a progress bar.

    Waits until the bar's indicator element is visible and has a ``width:`` in its
    inline style, then parses that width.

    :param test: test case object; only its ``browser`` is used
    :param progress_bar_sel: CSS selector of the progress bar
    :return: the indicator width in percent, as int
    """
    b = test.browser
    indicator = progress_bar_sel + " .pf-v5-c-progress__indicator"

    b.wait_visible(indicator)
    b.wait_attr_contains(indicator, "style", "width:")

    # only whole-number percentages match
    width = re.search(r"width: (\d+)%;", b.attr(indicator, "style"))
    return int(width.group(1))


def topServiceValue(test, aria_label, col_label, row):
    """Return the numeric part of a table cell, dropping the unit.

    :param test: test case object; only its ``browser`` is used
    :param aria_label: ``aria-label`` of the table
    :param col_label: ``data-label`` of the column
    :param row: 1-based row number within the table body (used with ``:nth-of-type``)
    :return: the cell's leading number as float
    """
    cell = "table[aria-label='%s'] tbody tr:nth-of-type(%d) td[data-label='%s']" % (aria_label, row, col_label)
    # split off unit, like "12 MB"
    number, *_unit = test.browser.text(cell).split(' ')
    return float(number)


def prepareArchive(machine, name, time, hostname="localhost.localdomain"):
    """Install a pre-recorded metrics archive on the machine and set its clock.

    Uploads the archive from ``verify/files/metrics-archives/`` to ``/tmp/``, then runs
    one shell script which disables NTP if it is on, stops pmlogger and NetworkManager,
    sets the host name, empties ``/var/log/pcp/pmlogger/``, unpacks the archive
    relative to ``/``, and finally sets the system time.

    :param machine: test machine; its ``upload()`` and ``execute()`` are used
    :param name: file name of the archive; names ending in "zip" are unpacked with
        ``unzip``, everything else with ``tar -xz``
    :param time: value passed to ``timedatectl set-time @<time>``
    :param hostname: host name to set before unpacking
    """
    machine.upload([f"verify/files/metrics-archives/{name}"], "/tmp/")

    # pick the extraction command by archive type
    if name.endswith("zip"):
        command = f"unzip /tmp/{name} -d /"
    else:
        command = f"tar -C / -xzvf /tmp/{name}"

    machine.execute("""ntp=`timedatectl show --property NTP --value`
                       if [ $ntp == "yes" ]; then
                           timedatectl set-ntp off
                       fi
                       systemctl stop pmlogger
                       # don't let NM set transient host names from DHCP
                       systemctl stop NetworkManager
                       hostnamectl set-hostname {2}
                       rm -rf /var/log/pcp/pmlogger/*
                       {0}
                       # set-ntp off is asynchronous; wait until timesyncd stops before the time can be set
                       while systemctl is-active systemd-timesyncd; do sleep 1; done
                       timedatectl set-time @{1}""".format(command, time, hostname))


def redisService(image):
    """Return the name of the redis systemd service for the given OS image.

    :param image: image name, like "debian-stable"
    :return: "redis-server" for images starting with "debian" or "ubuntu", else "redis"
    """
    return "redis-server" if image.startswith(("debian", "ubuntu")) else "redis"


def applySettings(browser):
    """Confirm the PCP settings dialog and wait until it is gone.

    :param browser: browser object showing the ``#pcp-settings-modal`` dialog
    """
    dialog = "#pcp-settings-modal"

    browser.click(dialog + " button.pf-m-primary")
    # wait for the dialog to disappear with a wait timeout of 30 (presumably seconds)
    with browser.wait_timeout(30):
        browser.wait_not_present(dialog)


def login(self):
    """Log in and open the metrics page.

    On images starting with "ubuntu" or "debian", log into the overview page first, wait
    until a "metrics1" channel with source "pcp-archive" can be opened without a problem,
    and only then follow the link to the metrics page. Everywhere else, go to /metrics directly.

    :param self: test case object; its ``machine``, ``browser`` and ``login_and_go()`` are used
    """
    if not self.machine.image.startswith(("ubuntu", "debian")):
        self.login_and_go("/metrics")
        return

    # HACK: Ubuntu and Debian need some time until metrics channel is available
    # Really no idea what it needs to wait for, so let's just try channel until it succeeds
    self.login_and_go("/system")
    # resolves to 0 if the channel closes with a problem, to 1 otherwise
    probe_channel = """(function() {
            return new Promise((resolve, reject) => {
                cockpit.spawn(["date", "+%s"])
                    .then(out => {
                        const now = parseInt(out.trim()) * 1000;
                        const current_hour = Math.floor(now / 3600000) * 3600000;
                        const metrics_channel = cockpit.channel({ payload: "metrics1", source: "pcp-archive",
                            interval: 5000, metrics: [{ name: "kernel.all.cpu.nice", derive: "rate" }],
                            timestamp: current_hour, limit: 10 });
                        metrics_channel.addEventListener("close", (ev, error) => {
                            if (error.problem) {
                                console.log("Channel is not ready:", error.problem);
                                resolve(0);
                            } else
                                resolve(1);
                        });
                    });
                });
            })"""
    self.browser.wait(lambda: self.browser.call_js_func(probe_channel))
    self.browser.click("a:contains('View metrics and history')")
    self.browser.enter_page("/metrics")


@skipDistroPackage()
class TestHistoryMetrics(MachineCase):
    def setUp(self):
        # Per-test preparation: stop pmlogger/pmproxy and clear their failed state;
        # on debian-stable additionally repair ownership of the pmlogger log directory.
        super().setUp()
        machine = self.machine

        # start with a clean slate and avoid running into restart limits
        machine.execute("systemctl stop pmlogger pmproxy; systemctl reset-failed pmlogger pmproxy 2>/dev/null || true")

        if machine.image != 'debian-stable':
            return

        # HACK: work around pcp breaking permissions: https://bugzilla.redhat.com/show_bug.cgi?id=2013937
        # This is failing in too many ways to meaningfully cover with naughty
        machine.execute("chown -R pcp:pcp /var/log/pcp/pmlogger/")

    def waitStream(self, current_max):
        # Wait for the page to show one more minute of valid CPU data than it does now,
        # and check that the list does not start with too many empty minutes.
        #
        # current_max: upper bound for the number of valid minutes present when called
        b = self.browser
        valid_selector = ".metrics-data-cpu.valid-data"

        # should only have at most <current_max> valid minutes, the rest should be empty
        valid_before = b.call_js_func("ph_count", valid_selector)
        self.assertLessEqual(valid_before, current_max)

        # page auto-updates every minute
        with b.wait_timeout(90):
            b.wait_js_func("(exp => ph_count('.metrics-data-cpu.valid-data') == exp)", valid_before + 1)

        # Should never show more than 4 empty leading minutes (block of 5 minutes but always at least one used)
        empty_prefix_length = b.call_js_func("""(function () {
            const lines = document.getElementsByClassName("metrics-data-cpu");
            let counter = 0;

            Array.from(lines).every(l => {
                if (l.classList.contains("empty-data")) {
                    counter++;
                    return true;
                } else {
                    return false;;
                }
            });

            return counter;
        })""")
        self.assertLessEqual(empty_prefix_length, 4)

    @skipOstree("no PCP support")
    def testBasic(self):
        # Exercise the metrics history page with a freshly started pmlogger and an emptied
        # archive directory: only the current hour is shown, the "no data" alert reacts to
        # loading earlier data, new minutes stream in, graph columns can be hidden, the
        # date picker shows an empty previous day, and the breadcrumb leads back to the overview.
        b = self.browser
        m = self.machine

        # switch off NTP (if on) and wait for timesyncd to stop, so that a fixed time can be set
        m.execute("""ntp=`timedatectl show --property NTP --value`
             if [ $ntp == "yes" ]; then
                 timedatectl set-ntp off
             fi""")
        m.execute("while systemctl is-active systemd-timesyncd; do sleep 1; done")
        m.execute("timedatectl set-time '2020-11-24 09:24:05'")

        # clean slate, to avoid seeing the data from preparing the VM
        m.execute("rm -rf /var/log/pcp/pmlogger/*; systemctl start pmlogger")

        login(self)
        # eventually finishes data loading and shows heading
        b.wait_in_text(".metrics-heading", "CPU")

        # only shows current hour
        b.wait_js_func("ph_count_check", ".metrics-hour", 1)

        # VM just started, we don't have 12 hours of data
        b.wait_in_text(".metrics .pf-v5-c-alert", "No data available between")
        # initial data gap is < 24 hours, does not show date
        year = m.execute("date +%Y").strip()
        self.assertNotIn(year, b.text(".metrics .pf-v5-c-alert"))

        # can try to load earlier data; only updates "no data" alert as there is no data
        b.wait_text(".bottom-panel button", "Load earlier data")
        b.click(".bottom-panel button")
        # now the gap is > 24 hours, does show date
        b.wait_in_text(".metrics .pf-v5-c-alert", year)
        # still only one hour
        b.wait_js_func("ph_count_check", ".metrics-hour", 1)

        # at most 3 valid minutes so far; wait for the next one to arrive
        self.waitStream(3)

        # Graphs are by default all visible
        b.click("button[aria-label='Graph visibility options menu']")
        b.wait_visible("#column-visibility-option-cpu:checked")
        b.wait_visible(".metrics-label-graph:contains(CPU)")
        b.wait_visible("#column-visibility-option-memory:checked")
        b.wait_visible(".metrics-label-graph:contains(Memory)")
        b.wait_visible("#column-visibility-option-disks:checked")
        b.wait_visible(".metrics-label-graph:contains(Disk I/O)")
        b.wait_visible("#column-visibility-option-network:checked")
        b.wait_visible(".metrics-label-graph:contains(Network)")

        # Change graph visibility
        b.wait_visible(".metrics-events:contains('Network I/O')")
        b.set_checked("#column-visibility-option-network", False)
        b.wait_not_present(".metrics-events:contains('Network I/O')")
        b.wait_not_present(".metrics-label-graph:contains(Network)")
        # restore the default, so that the network column is shown again
        b.set_checked("#column-visibility-option-network", True)

        # Change date to yesterday, should be empty
        b.click("#date-picker-select-toggle .pf-v5-c-select__toggle-arrow")
        b.click(".pf-v5-c-select__menu-wrapper:nth-child(2) button")
        b.wait_text(".pf-v5-c-empty-state", "No data available")

        # Breadcrumb back to Overview page
        b.click(".pf-v5-c-breadcrumb li:first-child")
        b.enter_page("/system")
        b.wait_visible('.system-information')

    @skipOstree("no PCP support")
    def testEvents(self):
        # Replay pre-recorded PCP archives (disk, CPU/network, memory, double events, journal)
        # with the clock set to fixed times, and check the rendered graph values, the
        # detected events, the hour headers, the date picker, and the journal log integration.
        b = self.browser
        m = self.machine

        # raise the wait timeout for the whole test (not used as a context manager)
        b.wait_timeout(60)

        def events_at(hour, minute):
            # Return the text of the events of the given minute; the hour is expanded
            # to read them and compressed again afterwards
            b.wait_visible(f"#metrics-hour-{hour}.metrics-hour-compressed")
            b.click(f"#metrics-hour-{hour} button.metrics-events-expander")
            events = b.text(f"#metrics-hour-{hour} div.metrics-minute[data-minute={minute}] .metrics-events")
            b.click(f"#metrics-hour-{hour} button.metrics-events-expander")

            return events

        #
        # Disks
        #

        # disable swap, so that we can test current metrics without swap
        m.execute('''systemctl stop "*.swap" "swap-create@*" "systemd-zram-setup@*" || true
                     systemctl mask "swap-create@" "systemd-zram-setup@"
                     swapoff --all
                     while [ -n "$(swapon --show)" ]; do sleep 1; done''')

        prepareArchive(m, "disk.tar.gz", 1597672800)

        login(self)
        # eventually finishes data loading and shows heading
        b.wait_in_text(".metrics-heading", "CPU")

        # Big spike lasting 3 minutes
        self.assertGreaterEqual(getMaximumSpike(self, "disks", False, 1597662000000, 25), 0.9)
        self.assertGreaterEqual(getCompressedMinuteValue(self, "disks", False, 1597662000000, 26), 0.9)
        self.assertGreaterEqual(getCompressedMinuteValue(self, "disks", False, 1597662000000, 27), 0.9)

        # Smaller spike lasting 2 minutes
        self.assertGreaterEqual(getMaximumSpike(self, "disks", False, 1597662000000, 28), 0.4)
        self.assertLessEqual(getMaximumSpike(self, "disks", False, 1597662000000, 28), 0.6)
        self.assertGreaterEqual(getCompressedMinuteValue(self, "disks", False, 1597662000000, 29), 0.4)
        # recognized as event

        self.assertIn("Disk I/O", events_at(1597662000000, 28))

        # No visible activity after that
        self.assertLessEqual(getCompressedMinuteValue(self, "disks", False, 1597662000000, 30), 0.01)

        # swap usage is not shown if there is no swap
        b.wait_visible("#current-memory-usage")
        self.assertFalse(b.is_present("#current-swap-usage"))

        # Check that we don't show too many empty minutes in the first hour
        self.assertLessEqual(b.call_js_func("ph_count", ".metrics-data-cpu"), 35)

        # Check metrics hour header in compressed and expanded mode
        b.click("#metrics-hour-1597662000000 button.metrics-events-expander")
        b.wait_in_text("#metrics-hour-1597662000000:not(.metrics-hour-compressed) .metrics-events-hour-header-expanded time", "1:00")
        b.wait_in_text("#metrics-hour-1597662000000:not(.metrics-hour-compressed) .metrics-events-hour-header-expanded .spikes_count", "3 spikes")
        b.wait_in_text("#metrics-hour-1597662000000:not(.metrics-hour-compressed) .metrics-events-hour-header-expanded .spikes_info", "1 Memory, 1 Disk I/O, 1 Network I/O")

        b.assert_pixels(".metrics", "metrics-history-expanded-hour", ignore=[".spikes_count"])

        b.click("#metrics-hour-1597662000000 button.metrics-events-expander")
        b.wait_in_text("#metrics-hour-1597662000000.metrics-hour-compressed", "1:00")
        b.wait_in_text("#metrics-hour-1597662000000.metrics-hour-compressed .spikes_count", "3 spikes")
        b.wait_in_text("#metrics-hour-1597662000000.metrics-hour-compressed .spikes_info", "1 Memory, 1 Disk I/O, 1 Network I/O")

        b.assert_pixels(".metrics", "metrics-history-compressed-hour", ignore=[".nodata"], skip_layouts=["mobile", "rtl"])

        # Check that events are not visible for compressed hours
        b.wait_not_present("#metrics-hour-1597662000000 div.metrics-minute[data-minute=28] .metrics-events")
        b.click("#metrics-hour-1597662000000 button.metrics-events-expander")
        b.wait_visible("#metrics-hour-1597662000000 div.metrics-minute[data-minute=28] .metrics-events")

        b.logout()

        #
        # Network and CPU
        #

        prepareArchive(m, "cpu_network.tar.gz", 1598918400)

        login(self)
        # eventually finishes data loading and shows heading
        b.wait_in_text(".metrics-heading", "CPU")

        # Test network - Big spike lasting 2 minutes
        self.assertGreaterEqual(getMaximumSpike(self, "network", False, 1598950800000, 3), 0.5)
        self.assertGreaterEqual(getMaximumSpike(self, "network", False, 1598950800000, 4), 0.5)
        # recognized as event
        self.assertIn("Network I/O", events_at(1598950800000, 3))
        # but it's not a new event in minute 4
        self.assertNotIn("Network I/O", events_at(1598950800000, 4))

        # Followed by smaller spike
        self.assertGreaterEqual(getMaximumSpike(self, "network", False, 1598950800000, 5), 0.35)
        self.assertLessEqual(getMaximumSpike(self, "network", False, 1598950800000, 5), 0.5)
        # still not a new spike
        self.assertNotIn("Network I/O", events_at(1598950800000, 5))

        # Followed by virtually no data
        self.assertLessEqual(getCompressedMinuteValue(self, "network", False, 1598950800000, 6), 0.01)

        # Test CPU load - big - small - big spikes
        self.assertGreaterEqual(getMaximumSpike(self, "cpu", False, 1598950800000, 3), 0.9)
        self.assertGreaterEqual(getMaximumSpike(self, "cpu", False, 1598950800000, 4), 0.5)
        self.assertLessEqual(getMaximumSpike(self, "cpu", False, 1598950800000, 4), 0.55)
        self.assertGreaterEqual(getMaximumSpike(self, "cpu", False, 1598950800000, 5), 0.9)
        self.assertIn("CPU", events_at(1598950800000, 2))
        self.assertIn("CPU", events_at(1598950800000, 5))

        # Test CPU saturation - 3 spikes, each 2 minutes (medium, big, small)
        self.assertGreaterEqual(getMaximumSpike(self, "cpu", True, 1598950800000, 3), 0.5)
        self.assertLessEqual(getMaximumSpike(self, "cpu", True, 1598950800000, 3), 0.6)
        self.assertGreaterEqual(getMaximumSpike(self, "cpu", True, 1598950800000, 4), 0.5)
        self.assertLessEqual(getMaximumSpike(self, "cpu", True, 1598950800000, 4), 0.6)

        self.assertGreaterEqual(getMaximumSpike(self, "cpu", True, 1598950800000, 5), 0.8)
        self.assertGreaterEqual(getCompressedMinuteValue(self, "cpu", True, 1598950800000, 6), 0.8)

        self.assertGreaterEqual(getCompressedMinuteValue(self, "cpu", True, 1598950800000, 7), 0.3)
        self.assertLessEqual(getCompressedMinuteValue(self, "cpu", True, 1598950800000, 7), 0.4)
        self.assertGreaterEqual(getCompressedMinuteValue(self, "cpu", True, 1598950800000, 8), 0.3)
        self.assertLessEqual(getCompressedMinuteValue(self, "cpu", True, 1598950800000, 8), 0.4)

        self.assertNotIn("Load", events_at(1598950800000, 2))
        self.assertIn("Load", events_at(1598950800000, 3))
        self.assertNotIn("Load", events_at(1598950800000, 4))
        self.assertIn("Load", events_at(1598950800000, 5))

        b.logout()

        #
        # Memory
        #

        # non-empty output means that some swap is active
        have_swap = m.execute("swapon --show").strip()

        prepareArchive(m, "memory.tar.gz", 1600248000)
        login(self)
        b.wait_in_text(".metrics-heading", "CPU")

        # basic RAM consumption after boot; it's still a network spike, thus event+SVG
        self.assertLessEqual(getMaximumSpike(self, "memory", False, 1600236000000, 44), 0.3)
        self.assertNotIn("Memory", events_at(1600236000000, 44))
        if have_swap:
            self.assertAlmostEqual(getMaximumSpike(self, "memory", True, 1600236000000, 44), 0)
            self.assertNotIn("Swap", events_at(1600236000000, 44))

            # swap event from :46 to :47
            self.assertGreater(getMaximumSpike(self, "memory", True, 1600236000000, 46), 0.9)
            self.assertIn("Swap", events_at(1600236000000, 46))
            # continuous, no new Swap event, but still a Memory+Network event
            self.assertGreater(getMaximumSpike(self, "memory", True, 1600236000000, 47), 0.9)
            self.assertNotIn("Swap", events_at(1600236000000, 47))

        else:
            # If no swap, the column is hidden
            # assertNotIn takes (member, container): "Swap" must not appear in the heading text
            self.assertNotIn("Swap", b.text(".metrics-heading"))
            b.wait_not_present(".metrics-data-memory .saturation")

        # memory spike in :47
        self.assertGreater(getMaximumSpike(self, "memory", False, 1600236000000, 47), 0.6)
        self.assertIn("Memory", events_at(1600236000000, 47))

        # at :54 the machine is loaded to ~80% so no event even if elevated
        self.assertGreater(getCompressedMinuteValue(self, "memory", False, 1600236000000, 54), 0.8)
        b.wait_not_present("#metrics-hour-1600236000000 div.metrics-minute[data-minute=54] .metrics-events")
        if have_swap:
            self.assertAlmostEqual(getCompressedMinuteValue(self, "memory", True, 1600236000000, 54), 0.0)

        # everything is quiet in :55
        self.assertLess(getCompressedMinuteValue(self, "memory", False, 1600236000000, 55), 0.4)
        if have_swap:
            self.assertAlmostEqual(getCompressedMinuteValue(self, "memory", True, 1600236000000, 55), 0.0)

        b.logout()

        #
        # Check changing of time
        #

        m.execute("timedatectl set-time @1600550674")
        login(self)
        # self.waitStream(3) # FIXME: wait for new data - pcp does not handle time change greatly
        b.wait_text("#date-picker-select-toggle .pf-v5-c-select__toggle-text", "Today")

        b.select_PF4("#date-picker-select-toggle", "Wednesday, September 16, 2020")
        self.assertGreater(getMaximumSpike(self, "memory", False, 1600236000000, 51), 0.5)
        self.assertIn("Memory", events_at(1600236000000, 51))

        # Reload should keep the filters intact
        b.reload()
        b.enter_page("/metrics")
        b.wait_text("#date-picker-select-toggle .pf-v5-c-select__toggle-text", "Wednesday, September 16, 2020")

        b.click("#date-picker-select-toggle")
        b.click(".pf-v5-c-select__menu-item:contains('Today')")
        b.wait_text("#date-picker-select-toggle .pf-v5-c-select__toggle-text", "Today")
        # self.waitStream(4) # FIXME: wait for new data - pcp does not handle time change greatly

        b.logout()

        #
        # Check that for every minute only one event is present
        #

        if self.machine.image == TEST_OS_DEFAULT:  # Debian/Ubuntu is unhappy about this archive, one Fedora test is enough though
            prepareArchive(m, "double_events.zip", 1602345600, "m1.cockpit.lan")
            login(self)
            b.wait_in_text(".metrics-heading", "CPU")
            b.wait_in_text("#metrics-hour-1602334800000", "CPU")
            # every events element must only contain distinct <dd> entries
            self.assertTrue(self.browser.call_js_func("""(function () {
                const min_events = document.getElementsByClassName("metrics-events");
                return Array.from(min_events).every(l => {
                    const events = Array.from(l.getElementsByTagName("dd")).map(d => d.innerHTML);
                    return (new Set(events)).size === events.length;
                });
            })"""))

            b.logout()

        #
        # Journal logs
        #

        prepareArchive(m, "with_journal.tar.gz", 1615200500, "m1.cockpit.lan")
        # first check the "no logs found" case
        login(self)
        b.wait_in_text(".metrics-heading", "CPU")
        b.click("#metrics-hour-1615197600000 button.metrics-events-expander")
        b.wait_in_text("#metrics-hour-1615197600000 div.metrics-minute[data-minute=39] .metrics-events span.spikes_info", "Load")

        # Now add the journal
        # Journal was recorded on Fedora 33 and when trying to use it with older systemd it fails with:
        # `Journal file /var/log/journal/*/journal.journal uses an unsupported feature, ignoring file.`

        if self.machine.image in ["centos-8-stream", "rhel-8-7", "rhel-8-8", "rhel-8-9", "debian-stable"]:
            return

        m.upload(["verify/files/metrics-archives/journal.journal.gz"], "/tmp")
        m.execute('''gunzip /tmp/journal.journal.gz
                     cp /tmp/journal.journal /var/log/journal/*/''')
        b.reload()
        b.enter_page("/metrics")

        b.wait_in_text(".metrics-heading", "CPU")
        b.click("#metrics-hour-1615197600000 button.metrics-events-expander")
        b.click("#metrics-hour-1615197600000 div.metrics-minute[data-minute=39] .metrics-events button.spikes_info")
        b.wait_visible(".cockpit-log-message:contains('Created slice cockpittest.slice.')")
        b.wait_in_text(".cockpit-logline:first-child .cockpit-log-message", "cpu-piglet")
        b.click(".cockpit-logline:first-child .cockpit-log-message")
        b.enter_page("/system/logs")
        b.wait_in_text(".pf-v5-c-card__title", "cpu-piglet")
        b.click("li:contains('Logs')")
        b.wait_visible(".cockpit-log-message:contains('Created slice cockpittest.slice.')")

        b.go("/metrics")
        b.enter_page("/metrics")
        # logs exist, should show tight range
        b.click("button:contains('View detailed logs')")
        b.enter_page("/system/logs")
        b.wait_visible(".cockpit-log-message:contains('Created slice cockpittest.slice.')")
        url = b.eval_js('window.location.hash')
        self.assertIn("priority=info", url)
        self.assertIn("since=2021-3-8%2010%3A39%3A0", url)
        self.assertIn("until=2021-3-8%2010%3A39%3A45", url)

    @nondestructive
    @skipOstree("no PCP support")
    def testNoDataEnable(self):
        # With an empty (tmpfs) archive directory and pmlogger not running, the page shows
        # an empty state; enabling pmlogger from its settings dialog makes data appear.
        b = self.browser
        m = self.machine
        empty_state = ".pf-v5-c-empty-state"
        pmlogger_switch = "#switch-pmlogger"

        m.execute("""mount -t tmpfs tmpfs /var/log/pcp/pmlogger
                     chown -R pcp:pcp /var/log/pcp/pmlogger
                     if selinuxenabled; then restorecon /var/log/pcp/pmlogger; fi""")
        self.addCleanup(m.execute, "systemctl stop pmlogger; until umount /var/log/pcp/pmlogger; do sleep 1; done")

        self.login_and_go("/metrics")

        for message in ("Metrics history could not be loaded", "pmlogger.service is not running"):
            b.wait_in_text(empty_state, message)

        # enable pmlogger in settings dialog from empty state
        b.click(empty_state + " button.pf-m-primary")
        b.wait_visible("#pcp-settings-modal")
        b.wait_visible(pmlogger_switch + ":not(:checked)")
        b.click(pmlogger_switch)
        b.wait_visible(pmlogger_switch + ":checked")
        applySettings(b)

        m.execute("until systemctl is-active pmlogger; do sleep 1; done")

        # there is a transient "No data available" state, but sometimes it's very short, so don't assert that

        # page auto-updates every minute and starts to receive data
        with b.wait_timeout(90):
            b.wait_js_cond("ph_count('.metrics-data-cpu.valid-data') >= 1")
        b.wait_not_present(empty_state)

        b.logout()

    @nondestructive
    @skipOstree("no PCP support")
    def testNoDataFailed(self):
        # With a pmlogger unit that always fails and an empty (tmpfs) archive directory,
        # the page shows a "has failed" empty state whose link leads to the service details.
        b = self.browser
        m = self.machine

        # runtime drop-in which replaces the service command with one that always fails
        m.write("/run/systemd/system/pmlogger.service.d/break.conf", "[Service]\nExecStart=\nExecStart=/bin/false")
        m.execute(r"""mount -t tmpfs tmpfs /var/log/pcp/pmlogger
                      if selinuxenabled; then restorecon /var/log/pcp/pmlogger; fi
                      systemctl daemon-reload
                      systemctl start pmlogger || true""")
        # undo the drop-in and the tmpfs mount after the test
        self.addCleanup(m.execute,
                        """rm -r /run/systemd/system/pmlogger.service.d/
                        umount /var/log/pcp/pmlogger
                        systemctl daemon-reload""")

        self.login_and_go("/metrics")

        b.wait_in_text(".pf-v5-c-empty-state", "Metrics history could not be loaded")
        b.wait_in_text(".pf-v5-c-empty-state", "pmlogger.service has failed")

        # Troubleshoot
        b.click(".pf-v5-c-empty-state button.pf-m-link")
        b.enter_page("/system/services")
        b.wait_in_text("#service-details", "pmlogger.service")

    @nondestructive
    @skipOstree("no PCP support")
    def testLoggerSettings(self):
        # Disable and re-enable pmlogger through the settings dialog in the header bar,
        # and check the resulting systemd unit state each time.
        b = self.browser
        m = self.machine

        def toggle_pmlogger(currently_enabled):
            # open the settings dialog from the header bar, flip the pmlogger switch, and apply
            if currently_enabled:
                before, after = ":checked", ":not(:checked)"
            else:
                before, after = ":not(:checked)", ":checked"
            b.click("#metrics-header-section button.pf-m-secondary")
            b.wait_visible("#pcp-settings-modal")
            b.wait_visible("#switch-pmlogger" + before)
            b.click("#switch-pmlogger")
            b.wait_visible("#switch-pmlogger" + after)
            applySettings(b)

        # start in defined state
        m.execute("systemctl enable --now pmlogger")
        self.addCleanup(m.execute, "systemctl disable --now pmlogger")

        self.login_and_go("/metrics")

        # disable pmlogger in settings dialog from header bar
        toggle_pmlogger(currently_enabled=True)

        for query, expected in (("is-active", "inactive"), ("is-enabled", "disabled")):
            self.assertEqual(m.execute(f"systemctl {query} pmlogger || true").strip(), expected)

        # enable pmlogger in settings dialog from header bar
        toggle_pmlogger(currently_enabled=False)

        m.execute("until systemctl is-active pmlogger; do sleep 1; done")
        self.assertEqual(m.execute("systemctl is-enabled pmlogger").strip(), "enabled")

    @nondestructive
    @skipOstree("no PCP support")
    def testPmProxySettings(self):
        # Enable and disable pmproxy through the settings dialog under various starting
        # conditions (redis/pmproxy already running, with and without firewalld), check the
        # firewalld alert and its actions, and check that the dialog picks up service
        # changes made from outside.
        b = self.browser
        m = self.machine

        m.execute("systemctl start firewalld")

        # Arch Linux has no active zone by default which the firewalld port alert test requires.
        if m.image == "arch":
            m.execute("firewall-cmd --zone=public --change-interface eth0 --permanent")
            m.execute("firewall-cmd --reload")

        redis = redisService(m.image)
        hostname = m.execute("hostname").strip()

        self.addCleanup(m.execute, f"systemctl stop {redis}")

        def checkEnable(firewalld_alert):
            # Switch pmproxy on in the settings dialog and check that pmproxy and redis run,
            # pmproxy is enabled and wants redis, and the series API answers with our host name.
            # firewalld_alert: whether the pmproxy firewalld alert is expected to show up
            b.click("#metrics-header-section button.pf-m-secondary")
            b.wait_visible("#pcp-settings-modal")
            b.wait_visible("#switch-pmproxy:not(:checked)")
            b.click('#switch-pmproxy')
            b.wait_visible('#switch-pmproxy:checked')
            applySettings(b)
            if firewalld_alert:
                b.wait_visible(".pf-v5-c-alert:contains(pmproxy)")
            else:
                b.wait_not_present(".pf-v5-c-alert:contains(pmproxy)")
            m.execute('while [ $(systemctl is-active pmproxy) = activating ]; do sleep 1; done')
            self.assertEqual(m.execute("systemctl is-active pmproxy").strip(), "active")
            self.assertEqual(m.execute(f"systemctl is-active {redis}").strip(), "active")
            self.assertEqual(m.execute("systemctl is-enabled pmproxy").strip(), "enabled")
            self.assertIn("redis", m.execute("systemctl show -p Wants --value pmproxy").strip())
            wait(lambda: hostname in m.execute("curl --max-time 10 --silent --show-error 'http://localhost:44322/series/labels?names=hostname'"), delay=10, tries=30)

        def checkDisable():
            # Switch pmproxy off in the settings dialog and check that it is stopped and
            # disabled, while redis keeps running
            b.click("#metrics-header-section button.pf-m-secondary")
            b.wait_visible("#pcp-settings-modal")
            b.wait_visible('#switch-pmproxy:checked')
            b.click('#switch-pmproxy')
            b.wait_visible("#switch-pmproxy:not(:checked)")
            applySettings(b)
            # always clears the firewalld alert
            b.wait_not_present(".pf-v5-c-alert:contains(pmproxy)")
            # the leading "!" inverts the exit status, so these commands must fail to pass
            self.assertEqual(m.execute("! systemctl is-active pmproxy").strip(), "inactive")
            self.assertEqual(m.execute("! systemctl is-enabled pmproxy").strip(), "disabled")
            # keeps redis running, it's a shared service
            self.assertEqual(m.execute(f"systemctl is-active {redis}").strip(), "active")
            # but drops the pmproxy dependency
            self.assertNotIn("redis", m.execute("systemctl show -p Wants --value pmproxy").strip())
            m.execute("! curl --silent --show-error --max-time 10 'http://localhost:44322/series/labels?names=hostname' 2>&1")

        # start in a defined state; all test images have pcp and redis pre-installed
        m.execute(f"systemctl disable --now pmlogger pmie pmproxy {redis}")
        m.execute("systemctl reset-failed")
        # ensure pmproxy is not already opened in firewall
        m.execute("firewall-cmd --remove-service pmproxy; firewall-cmd --permanent --remove-service pmproxy")
        self.login_and_go("/metrics")

        # pmproxy can't be enabled without pmlogger
        b.click("#metrics-header-section button.pf-m-secondary")
        b.wait_visible("#pcp-settings-modal")
        b.wait_visible("#switch-pmlogger:not(:checked)")
        b.wait_visible("#switch-pmproxy:not(:checked)")
        b.wait_visible("#switch-pmproxy:disabled")
        # enable pmlogger
        b.click('#switch-pmlogger')
        b.wait_visible('#switch-pmlogger:checked')
        applySettings(b)
        m.execute('while [ $(systemctl is-active pmlogger) = activating ]; do sleep 1; done')
        self.assertEqual(m.execute("systemctl is-active pmlogger").strip(), "active")
        b.wait_not_present(".pf-v5-c-alert:contains(pmproxy)")

        checkEnable(True)
        checkDisable()

        # redis already running
        m.execute(f"systemctl start {redis}")
        checkEnable(True)
        checkDisable()

        # pmproxy already running; 44322 queries hang without redis and until restart
        m.execute(f"systemctl disable --now {redis}; systemctl start pmproxy")
        checkEnable(True)

        # without firewalld
        m.execute("firewall-cmd --remove-service pmproxy; firewall-cmd --permanent --remove-service pmproxy")
        m.execute("systemctl stop firewalld")
        self.allow_journal_messages(".*org.fedoraproject.FirewallD1.*disconnected.*")
        checkDisable()
        checkEnable(False)
        m.execute("systemctl start firewalld")

        # Go to firewall page from alert
        checkDisable()
        checkEnable(True)
        b.click(".pf-v5-c-alert button.pf-m-link")
        b.enter_page("/network/firewall")
        b.wait_visible("#firewall-heading")
        b.go("/metrics")
        b.enter_page("/metrics")

        # add pmproxy to default zone directly in alert
        default_zone = m.execute("firewall-cmd --get-default-zone").strip()
        b.wait_text("#firewalld-request-pmproxy", default_zone)
        b.click(".pf-v5-c-alert button.pf-m-primary")
        b.wait_not_present(".pf-v5-c-alert:contains(pmproxy)")
        self.assertIn("pmproxy", m.execute("firewall-cmd --list-services").strip())
        self.assertIn("pmproxy", m.execute("firewall-cmd --list-services --permanent").strip())

        # now service is already enabled, does not show alert
        checkDisable()
        checkEnable(False)

        # firewalld service enabled in permanent config already, does not trip over ALREADY_ENABLED
        checkDisable()
        m.execute("firewall-cmd --remove-service pmproxy")
        checkEnable(True)
        b.click(".pf-v5-c-alert button.pf-m-primary")
        b.wait_not_present(".pf-v5-c-alert:contains(pmproxy)")
        self.assertIn("pmproxy", m.execute("firewall-cmd --list-services").strip())

        # error during zone addition: zone disappears underneath us
        checkDisable()
        m.execute("""set -eux
                     firewall-cmd --permanent --remove-service pmproxy
                     firewall-cmd --permanent --new-zone=comeandgo
                     systemctl start NetworkManager
                     nmcli con add type dummy con-name fake ifname fake0 ip4 1.2.3.4/24
                     firewall-cmd --permanent --zone public --remove-interface fake0
                     firewall-cmd --permanent --zone comeandgo --add-interface fake0
                     firewall-cmd --reload
                  """)
        self.addCleanup(m.execute, "nmcli con delete fake; firewall-cmd --permanent --delete-zone comeandgo || true; firewall-cmd  --reload")
        checkEnable(True)
        b.select_PF4("#firewalld-request-pmproxy", "comeandgo")
        # delete the selected zone behind the page's back, so that adding the service fails
        m.execute("firewall-cmd --permanent --delete-zone comeandgo; firewall-cmd  --reload")
        b.click(".pf-v5-c-alert button.pf-m-primary")
        b.wait_in_text(".pf-v5-c-alert.pf-m-warning", "Failed to enable pmproxy in firewalld")
        b.wait_in_text(".pf-v5-c-alert.pf-m-warning", "INVALID_ZONE: comeandgo")
        # close warning
        b.click(".pf-v5-c-alert.pf-m-warning button.pf-m-plain")
        b.wait_not_present(".pf-v5-c-alert:contains(pmproxy)")

        # reacts to service changes from outside; this is asynchronous and the dialog deliberately
        # does not update automatically, so retry a few times
        def checkEnabled(expected):
            # Open and cancel the settings dialog up to 10 times, one second apart, until
            # the pmproxy switch has the expected state; raise Error if it never does
            for retry in range(10):
                b.click("#metrics-header-section button.pf-m-secondary")
                b.wait_visible('#switch-pmproxy')
                found = b.is_present("#switch-pmproxy" + (expected and ":checked" or ":not(:checked)"))
                b.click("#pcp-settings-modal button.btn-cancel")
                b.wait_not_present("#pcp-settings-modal")

                if found:
                    break
                time.sleep(1)
            else:
                raise Error("PCP settings dialog did not get expected value")

        m.execute(f"systemctl stop {redis}")
        checkEnabled(False)
        m.execute(f"systemctl start {redis}")
        checkEnabled(True)
        m.execute("systemctl stop pmproxy")
        checkEnabled(False)
        m.execute("systemctl start pmproxy")
        checkEnabled(True)


@skipDistroPackage()
@nondestructive
class TestCurrentMetrics(MachineCase):
    def setUp(self):
        """Quieten the machine, register cleanups for test resource hogs, and log in.

        Stores the repository name of the local busybox podman image in
        ``self.busybox_image`` for the container checks of the individual tests.
        """
        super().setUp()
        m = self.machine
        # packagekit/dnf often eats a lot of CPU; silence it to have better control over CPU usage
        packagekitd = "/usr/lib/packagekitd" if m.image == "arch" else "/usr/libexec/packagekitd"
        # every step is best-effort on its own: a packagekitd which is not running
        # must not prevent a running dnf from being killed
        m.execute(f"systemctl mask packagekit || true; killall -9 {packagekitd} || true; killall -9 dnf || true")

        self.addCleanup(m.execute, "systemctl unmask packagekit")
        # make sure to clean up our test resource consumers on failures
        self.addCleanup(m.execute, "systemctl stop cockpittest.slice 2>/dev/null || true")
        self.addCleanup(m.execute, "su - admin -c 'XDG_RUNTIME_DIR=/run/user/$(id -u admin) "
                                   "systemctl --user stop cockpittest.slice 2>/dev/null || true'")

        self.busybox_image = m.execute("podman images --format '{{.Repository}}' | grep busybox").strip()
        login(self)

    def testCPU(self):
        """Exercise the current CPU card of the metrics page.

        Steps, in order: CPU count and usage bar, top CPU services under synthetic
        systemd-run load, the 1 min load average, root and user podman containers,
        CPU temperature read from a fake tree bind-mounted over /sys/class, and the
        link from a user service row to the Services page.
        """
        b = self.browser
        m = self.machine

        def load_1min():
            # Load is a flex, each part looks like "1 min: 1.41,"
            return float(b.text("#load-avg .pf-v5-l-flex div:first-child").split()[-1].rstrip(','))

        b.wait_timeout(60)

        nproc = m.execute("nproc").strip()
        b.wait_in_text("#current-cpu-usage", nproc + " CPU")
        # top CPU core is not visible with just 1 core; our upstream test VMs have only 1 core,
        # but let's not just assume this for downstream gating/custom VMs
        if nproc == '1':
            self.assertFalse(b.is_present("#current-top-cpu-usage"))
            b.wait_text("#current-cpu-usage-description", "1 CPU")
        else:
            b.wait_visible("#current-top-cpu-usage")

        # wait until system settles down
        b.wait(lambda: progressValue(self, "#current-cpu-usage") < 20)
        m.execute("systemd-run --collect --slice cockpittest -p CPUQuota=60% --unit cpu-hog dd if=/dev/urandom of=/dev/null")
        m.execute("systemd-run --collect --slice cockpittest -p CPUQuota=30% --unit cpu-piglet dd if=/dev/urandom of=/dev/null")
        b.wait(lambda: progressValue(self, "#current-cpu-usage") > 75)
        # no other process in the test VM should take > 30% CPU, by the "settles down" assertion above
        b.wait_text("table[aria-label='Top 5 CPU services'] tbody tr:nth-of-type(1) td[data-label='Service']", "cpu-hog")
        b.wait_text("table[aria-label='Top 5 CPU services'] tbody tr:nth-of-type(2) td[data-label='Service']", "cpu-piglet")

        # There might be some other processes which take more resources
        # Keep this logging so we can easily debug which ones we might need to cleanup
        try:
            b.wait(lambda: topServiceValue(self, "Top 5 CPU services", "%", 1) > 50)
            b.wait(lambda: topServiceValue(self, "Top 5 CPU services", "%", 1) < 70)
            b.wait(lambda: topServiceValue(self, "Top 5 CPU services", "%", 2) > 20)
            b.wait(lambda: topServiceValue(self, "Top 5 CPU services", "%", 2) < 40)
        except BaseException:
            # deliberately broad: only logs the process list, then re-raises unchanged
            print(m.execute("top -b -n 1"))
            raise

        m.execute("systemctl stop cpu-hog cpu-piglet")
        # should go back to idle usage
        b.wait(lambda: progressValue(self, "#current-cpu-usage") < 20)
        # it could be that the table disappears completely if no service has a noticeable CPU usage;
        # so don't assume the table exists
        b.wait_not_in_text("#current-metrics-card-cpu", "cpu-hog")
        b.wait_not_in_text("#current-metrics-card-cpu", "cpu-piglet")

        # wait until the 1min load is low
        b.wait(lambda: load_1min() < 5)

        m.execute("systemd-run --collect --slice cockpittest --unit load-hog sh -ec "
                  "  'for i in `seq 500`; do dd if=/dev/urandom of=/dev/zero bs=100K count=500 status=none & done'")
        b.wait(lambda: load_1min() > 15)
        m.execute("systemctl stop load-hog 2>/dev/null || true")  # ok to fail, as the command exits by itself

        container_name = "pod-cpu-hog"
        m.execute(f"podman run --rm -d --name {container_name} {self.busybox_image} /bin/dd if=/dev/urandom of=/dev/null")
        # setUp only cleans up cockpittest.slice; remove the container explicitly so that
        # it cannot keep hogging the CPU for later tests if an assertion below fails
        self.addCleanup(m.execute, f"podman rm -f {container_name} || true")

        container_sha = m.execute(f"podman inspect --format '{{{{.Id}}}}' {container_name}").strip()
        shortid = container_sha[:12]

        # On some test images the container takes a while to show up
        with b.wait_timeout(300):
            b.wait_in_text("#current-metrics-card-cpu", f"pod {shortid}")
        b.wait(lambda: topServiceValue(self, "Top 5 CPU services", "%", 1) > 70)

        # It takes one re-render for the name lookup
        with b.wait_timeout(30):
            b.wait_in_text("#current-metrics-card-cpu", f"pod {container_name}")

        m.execute(f"podman stop -t 0 {container_name}")

        # RHEL-8 / CentOS-8's podman user containers do not show up as
        # libpod-$containerid but as podman-3679.scope.
        if m.image != "centos-8-stream" and not m.image.startswith("rhel-8"):
            # copy images for user podman tests; podman insists on user session
            m.execute(f"podman save {self.busybox_image} | sudo -i -u admin podman load")

            # Test user containers
            admin_s = ssh_connection.SSHConnection(user="admin",
                                                   address=m.ssh_address,
                                                   ssh_port=m.ssh_port,
                                                   identity_file=m.identity_file)
            user_container_name = "user-cpu-hog"
            admin_s.execute(f"podman run --rm -d --name {user_container_name} {self.busybox_image} /bin/dd if=/dev/urandom of=/dev/null")
            # same as for the root container: don't leave the hog behind on failures
            self.addCleanup(admin_s.execute, f"podman rm -f {user_container_name} || true")

            container_sha = admin_s.execute(f"podman inspect --format '{{{{.Id}}}}' {user_container_name}").strip()
            shortid = container_sha[:12]

            # On some test images the container takes a while to show up
            with b.wait_timeout(300):
                b.wait_in_text("#current-metrics-card-cpu", f"pod {shortid}")
            b.wait(lambda: topServiceValue(self, "Top 5 CPU services", "%", 1) > 70)

            # It takes one re-render for the name lookup
            with b.wait_timeout(30):
                b.wait_in_text("#current-metrics-card-cpu", f"pod {user_container_name}")

            admin_s.execute(f"podman stop -t 0 {user_container_name}")

        # this settles down slowly, don't wait for becoming really quiet
        with b.wait_timeout(300):
            b.wait(lambda: load_1min() < 10)

        # Files with CPU temperature do not exist, nothing is displayed
        b.wait_not_present("#current-metrics-card-cpu .temperature")

        # No matching type
        # cleanups run in reverse order: the umount registered below happens before this rm
        self.addCleanup(m.execute, "rm -rf /tmp/sensor-sys-class")
        m.write("/tmp/sensor-sys-class/hwmon/hwmon0/name", "BAT0")
        m.write("/tmp/sensor-sys-class/hwmon/hwmon0/temp1_input", "40000")
        m.execute("mount -o bind /tmp/sensor-sys-class /sys/class")
        self.addCleanup(m.execute, "umount /sys/class")
        b.logout()
        self.login_and_go("/metrics")

        b.wait_not_present("#current-metrics-card-cpu .temperature")

        # create files that contain CPU temperature
        # ARM
        m.write("/tmp/sensor-sys-class/hwmon/hwmon1/name", "cpu_thermal")
        m.write("/tmp/sensor-sys-class/hwmon/hwmon1/temp1_input", "30000")

        b.logout()
        self.login_and_go("/metrics")

        b.wait_in_text("#current-metrics-card-cpu", "30 °C")
        m.write("/tmp/sensor-sys-class/hwmon/hwmon1/temp1_input", "45000")
        b.wait_in_text("#current-metrics-card-cpu", "45 °C")

        # AMD
        m.execute("rm -rf /tmp/sensor-sys-class/hwmon/hwmon1/*")
        m.write("/tmp/sensor-sys-class/hwmon/hwmon1/name", "k10temp")
        # Tctl (temp1_input) will be ignored
        m.write("/tmp/sensor-sys-class/hwmon/hwmon1/temp1_label", "Tctl")
        m.write("/tmp/sensor-sys-class/hwmon/hwmon1/temp1_input", "40000")
        m.write("/tmp/sensor-sys-class/hwmon/hwmon1/temp1_max", "100000")
        m.write("/tmp/sensor-sys-class/hwmon/hwmon1/temp2_label", "Tccd1")
        m.write("/tmp/sensor-sys-class/hwmon/hwmon1/temp2_input", "35000")
        m.write("/tmp/sensor-sys-class/hwmon/hwmon1/temp2_max", "100000")
        m.write("/tmp/sensor-sys-class/hwmon/hwmon1/temp3_label", "Tccd3")
        m.write("/tmp/sensor-sys-class/hwmon/hwmon1/temp3_input", "30000")
        m.write("/tmp/sensor-sys-class/hwmon/hwmon1/temp3_max", "100000")

        b.logout()
        self.login_and_go("/metrics")

        b.wait_in_text("#current-metrics-card-cpu", "35 °C")
        m.write("/tmp/sensor-sys-class/hwmon/hwmon1/temp3_input", "55000")
        b.wait_in_text("#current-metrics-card-cpu", "55 °C")
        m.write("/tmp/sensor-sys-class/hwmon/hwmon1/temp2_input", "90000")
        b.wait_visible("#current-metrics-card-cpu .text-color-warning")
        b.wait_in_text("#current-metrics-card-cpu .text-color-warning", "90 °C")
        m.write("/tmp/sensor-sys-class/hwmon/hwmon1/temp2_input", "45000")
        # temp2_input cooled down, temp3_input is the hottest again
        b.wait_in_text("#current-metrics-card-cpu", "55 °C")

        # atk0110 motherboard
        m.execute("rm -rf /tmp/sensor-sys-class/hwmon/hwmon1/*")
        m.write("/tmp/sensor-sys-class/hwmon/hwmon1/name", "atk0110")
        # MB Temperature (temp2_label) will be ignored
        m.write("/tmp/sensor-sys-class/hwmon/hwmon1/temp1_label", "CPU Temperature")
        m.write("/tmp/sensor-sys-class/hwmon/hwmon1/temp1_input", "50000")
        m.write("/tmp/sensor-sys-class/hwmon/hwmon1/temp2_label", "MB Temperature")
        m.write("/tmp/sensor-sys-class/hwmon/hwmon1/temp2_input", "70000")

        b.logout()
        self.login_and_go("/metrics")

        b.wait_in_text("#current-metrics-card-cpu", "50 °C")
        m.write("/tmp/sensor-sys-class/hwmon/hwmon1/temp1_input", "95000")
        b.wait_visible("#current-metrics-card-cpu .text-color-critical")
        b.wait_in_text("#current-metrics-card-cpu .text-color-critical", "95 °C")
        # cooled down a little, warning color changes from red to yellow
        m.write("/tmp/sensor-sys-class/hwmon/hwmon1/temp1_input", "85000")
        b.wait_visible("#current-metrics-card-cpu .text-color-warning")
        b.wait_in_text("#current-metrics-card-cpu .text-color-warning", "85 °C")

        # intel
        m.execute("rm -rf /tmp/sensor-sys-class/hwmon/hwmon1/*")
        m.write("/tmp/sensor-sys-class/hwmon/hwmon1/name", "coretemp")
        m.write("/tmp/sensor-sys-class/hwmon/hwmon1/temp1_label", "Package id 0")
        m.write("/tmp/sensor-sys-class/hwmon/hwmon1/temp1_input", "60000")
        m.write("/tmp/sensor-sys-class/hwmon/hwmon1/temp1_crit", "100000")
        m.write("/tmp/sensor-sys-class/hwmon/hwmon1/temp2_label", "Core 0")
        m.write("/tmp/sensor-sys-class/hwmon/hwmon1/temp2_input", "50000")
        m.write("/tmp/sensor-sys-class/hwmon/hwmon1/temp2_crit", "100000")

        b.logout()
        self.login_and_go("/metrics")

        b.wait_in_text("#current-metrics-card-cpu", "60 °C")
        m.write("/tmp/sensor-sys-class/hwmon/hwmon1/temp2_input", "85000")
        b.wait_visible("#current-metrics-card-cpu .text-color-warning")
        b.wait_in_text("#current-metrics-card-cpu .text-color-warning", "85 °C")
        m.write("/tmp/sensor-sys-class/hwmon/hwmon1/temp2_input", "70000")
        # cooled down, warning color is not visible
        b.wait_not_present("#current-metrics-card-cpu .text-color-warning")
        b.wait_in_text("#current-metrics-card-cpu", "70 °C")

        # add second CPU
        m.write("/tmp/sensor-sys-class/hwmon/hwmon2/name", "coretemp")
        m.write("/tmp/sensor-sys-class/hwmon/hwmon2/temp1_label", "Package id 0")
        m.write("/tmp/sensor-sys-class/hwmon/hwmon2/temp1_input", "60000")
        m.write("/tmp/sensor-sys-class/hwmon/hwmon2/temp2_label", "Core 0")
        m.write("/tmp/sensor-sys-class/hwmon/hwmon2/temp2_input", "75000")

        b.logout()
        self.login_and_go("/metrics")

        # CPU 2 is the hottest
        b.wait_in_text("#current-metrics-card-cpu", "75 °C")
        m.write("/tmp/sensor-sys-class/hwmon/hwmon2/temp1_input", "80000")
        b.wait_in_text("#current-metrics-card-cpu", "80 °C")
        # CPU 1 is the hottest again
        m.write("/tmp/sensor-sys-class/hwmon/hwmon1/temp1_input", "90000")
        b.wait_in_text("#current-metrics-card-cpu", "90 °C")

        # Test link to user services
        # older releases don't have CPU accounting enabled for user services
        if m.image not in ["rhel-8-7", "rhel-8-8", "rhel-8-9", "centos-8-stream"]:
            m.execute("su - admin -c 'XDG_RUNTIME_DIR=/run/user/$(id -u admin) systemd-run --user --collect --slice cockpittest -p CPUQuota=60% --unit cpu-userhog dd if=/dev/urandom of=/dev/null'")
            # user services are always running underneath user@1000.service, so these two will compete for row 1 or 2
            b.wait_in_text("table[aria-label='Top 5 CPU services'] tbody", "cpu-userhog")
            b.click("table[aria-label='Top 5 CPU services'] tbody tr:contains(cpu-userhog) td[data-label='Service'] a span")
            b.enter_page("/system/services")
            b.wait_in_text(".service-name", "/usr/bin/dd if=/dev/urandom of=/dev/null")

    def testMemory(self):
        b = self.browser
        m = self.machine
        # only some images have swap
        have_swap = m.execute("swapon --show").strip()
        # wait until RAM usage is initialized
        b.wait(lambda: progressValue(self, "#current-memory-usage") > 10)

        # our test machines should use a reasonable chunk of available memory
        initial_usage = progressValue(self, "#current-memory-usage")
        self.assertGreater(initial_usage, 10)
        self.assertLess(initial_usage, 80)
        # allocate a chunk of memory; this may cause other stuff to get unmapped,
        # thus not exact addition, but usage should go up
        size = 300 if have_swap else 200  # MB
        self.write_file("/usr/local/bin/memhog.sh", f"""#!/usr/bin/awk -f
BEGIN {{
    x = sprintf("%{size}000000s","");
    system("touch /tmp/hogged; sleep infinity")
}}""", perm="755")

        m.execute("systemd-run --collect --slice cockpittest --unit mem-hog memhog.sh")
        m.execute("while [ ! -e /tmp/hogged ]; do sleep 1; done")
        # bars update every 3s
        time.sleep(8)
        hog_usage = progressValue(self, "#current-memory-usage")
        self.assertGreater(hog_usage, initial_usage + 8)

        b.wait_text("table[aria-label='Top 5 memory services'] tbody tr:nth-of-type(1) td[data-label='Service']", "mem-hog")
        b.wait(lambda: topServiceValue(self, "Top 5 memory services", "Used", 1) > size)
        b.wait(lambda: topServiceValue(self, "Top 5 memory services", "Used", 1) < size + 50)

        # total memory is shown as tooltip
        b.mouse("#current-memory-usage", "mouseenter")
        b.wait_in_text(".pf-v5-c-tooltip", "B total")
        b.mouse("#current-memory-usage", "mouseleave")

        # table entries are links to Services page
        b.click("table[aria-label='Top 5 memory services'] tbody tr:nth-of-type(1) td[data-label='Service'] a span")
        b.enter_page("/system/services")
        b.wait_in_text("#path", "/mem-hog.service")
        b.wait_in_text(".service-name", "memhog.sh")

        b.go("/metrics")
        b.enter_page("/metrics")
        b.wait_visible("table[aria-label='Top 5 memory services']")

        if have_swap:
            usage_hog1 = progressValue(self, "#current-memory-usage")

            # use even more memory to trigger swap
            m.execute("systemd-run --collect --slice cockpittest --unit mem-hog2 awk "
                      """'BEGIN { x = sprintf("%700000000s",""); system("sleep infinity") }'""")
            b.wait(lambda: progressValue(self, "#current-swap-usage") > 0)

            m.execute("systemctl stop mem-hog mem-hog2")

            # after stopping both hogs, usage should go down
            b.wait(lambda: progressValue(self, "#current-memory-usage") < usage_hog1)
            self.assertGreater(progressValue(self, "#current-memory-usage"), 10)
            b.wait_not_in_text("table[aria-label='Top 5 memory services'] tbody", "mem-hog")

            # total swap is shown as tooltip
            b.mouse("#current-swap-usage", "mouseenter")
            b.wait_in_text(".pf-v5-c-tooltip", "B total")
            b.mouse("#current-swap-usage", "mouseleave")
        else:
            m.execute("systemctl stop mem-hog")

        m.execute("rm /tmp/hogged")

        # Test Podman containers
        container_name = "pod-mem-hog"
        # pipe to tail to keep the data in memory
        m.execute(f"""
            podman run --rm -d --name {container_name} {self.busybox_image} /bin/sh -c '
            head -c 300m /dev/zero | tail | sleep infinity'""")

        # It takes one re-render for the name lookup
        with b.wait_timeout(30):
            b.wait_text("table[aria-label='Top 5 memory services'] tbody tr:nth-of-type(1) td[data-label='Service']", f"pod {container_name}")

        m.execute(f"podman stop -t 0 {container_name}")

        # RHEL-8 / CentOS-8's podman user containers do not show up as
        # libpod-$containerid but as podman-3679.scope.
        if m.image != "centos-8-stream" and not m.image.startswith("rhel-8"):
            # copy images for user podman tests; podman insists on user session
            m.execute(f"podman save {self.busybox_image} | sudo -i -u admin podman load")

            # Test user containers
            admin_s = ssh_connection.SSHConnection(user="admin",
                                                   address=m.ssh_address,
                                                   ssh_port=m.ssh_port,
                                                   identity_file=m.identity_file)
            user_container_name = "user-mem-hog"
            admin_s.execute(f"""
                podman run --rm -d --name {user_container_name} {self.busybox_image} /bin/sh -c '
                head -c 300m /dev/zero | tail | sleep infinity'
            """)

            # It takes one re-render for the name lookup
            with b.wait_timeout(30):
                b.wait_text("table[aria-label='Top 5 memory services'] tbody tr:nth-of-type(2) td[data-label='Service']", f"pod {user_container_name}")

            admin_s.execute(f"podman stop -t 0 {user_container_name}")

        # Test link to user services
        # older releases don't have memory accounting enabled for user services
        if m.image not in ["rhel-8-7", "rhel-8-8", "rhel-8-9", "centos-8-stream"]:
            m.execute("su - admin -c 'XDG_RUNTIME_DIR=/run/user/$(id -u admin) systemd-run --user --collect --slice cockpittest --unit mem-userhog memhog.sh'")
            m.execute("while [ ! -e /tmp/hogged ]; do sleep 1; done")
            # user services are always running underneath user@1000.service, so these two will compete for row 1 or 2
            b.wait_in_text("table[aria-label='Top 5 memory services'] tbody", "mem-userhog")
            b.click("table[aria-label='Top 5 memory services'] tbody tr:contains(mem-userhog) td[data-label='Service'] a span")
            b.enter_page("/system/services")
            b.wait_in_text(".service-name", "memhog.sh")

    def testDiskIO(self):
        """Exercise the current disks card of the metrics page.

        Steps, in order: overall read/write rates, the per-disk popover and the top
        disk usage services under a read hog, a write hog and a writing podman
        container; then the usage bar, tooltip and storage page link of a 50 MB
        loopback file system.
        """
        b = self.browser
        m = self.machine
        login(self)

        b.wait_timeout(60)

        # The two states a rate cell is checked for. Both alternatives of the calm
        # pattern sit inside one anchored group, so that a value like "0.5 MB/s"
        # cannot pass as calm just because it starts with "0".
        calm = r'^(0|[0-9.]+ (kB|B)/s)$'
        busy = r'^[0-9.]+ (MB|GB)/s$'
        # top services per cgroup are unsupported on rhel 8 and centos 8 as they use cgroupv1
        have_top_services = m.image != "centos-8-stream" and not m.image.startswith("rhel-8")

        def rate_matches(sel, pattern):
            # truthy (a match object) if the text of sel currently looks like pattern
            return re.match(pattern, b.text(sel))

        def popover_cell(device, label):
            return f"[aria-label='Disks usage'] [device-name='{device}'] [data-label='{label}']"

        def top_service_cell(label):
            return f"table[aria-label='Top 5 disk usage services'] tr:first-child td[data-label='{label}']"

        # test env should be quiet enough to not transmit MB/s
        b.wait(lambda: rate_matches("#current-disks-read", calm))
        b.wait(lambda: rate_matches("#current-disks-write", calm))
        # reading lots of data
        m.execute("systemd-run --collect --slice cockpittest --unit disk-read-hog sh -ec 'while true; do echo 3 > /proc/sys/vm/drop_caches; grep -r . /usr >/dev/null; done'")
        b.wait(lambda: rate_matches("#current-disks-read", busy))
        b.wait(lambda: rate_matches("#current-disks-write", calm))  # this should stay calm
        # read in popover
        b.click("#current-metrics-card-disks .all-disks-no-gap button")
        b.wait_visible(".pf-v5-c-popover .disks-nowrap")
        b.wait(lambda: rate_matches(popover_cell("vda", "Read"), busy))
        b.wait(lambda: rate_matches(popover_cell("vda", "Write"), calm))  # write should stay calm
        b.wait(lambda: rate_matches(popover_cell("sr0", "Read"), calm))  # other disks should stay calm
        # top service should be disk-read-hog
        if have_top_services:
            b.wait_text_matches(top_service_cell("Service"), "disk-read-hog")
            b.wait(lambda: rate_matches(top_service_cell("Read"), busy))
            b.wait(lambda: rate_matches(top_service_cell("Write"), calm))  # this should stay calm

        m.execute("systemctl stop disk-read-hog")
        b.wait(lambda: rate_matches(popover_cell("vda", "Read"), calm))  # back to quiet
        b.wait(lambda: rate_matches("#current-disks-read", calm))  # back to quiet
        b.click(".pf-v5-c-popover__close > button")
        # writing lots of data
        m.execute("systemd-run --collect --slice cockpittest --unit disk-write-hog sh -ec "
                  " 'while true; do dd if=/dev/zero of=/var/tmp/blob bs=1M count=100; done'")
        self.addCleanup(m.execute, "rm -f /var/tmp/blob")
        b.wait(lambda: rate_matches("#current-disks-write", busy))
        b.wait(lambda: rate_matches("#current-disks-read", calm))  # this should stay calm
        # write in popover
        b.click("#current-metrics-card-disks .all-disks-no-gap button")
        b.wait(lambda: rate_matches(popover_cell("vda", "Write"), busy))
        b.wait(lambda: rate_matches(popover_cell("vda", "Read"), calm))  # read should stay calm
        b.wait(lambda: rate_matches(popover_cell("sr0", "Write"), calm))  # other disks should stay calm
        # top service should be disk-write-hog
        if have_top_services:
            b.wait_text_matches(top_service_cell("Service"), "disk-write-hog")
            b.wait(lambda: rate_matches(top_service_cell("Write"), busy))
            b.wait(lambda: rate_matches(top_service_cell("Read"), calm))  # this should stay calm

        m.execute("systemctl stop disk-write-hog")
        b.wait(lambda: rate_matches(popover_cell("vda", "Write"), calm))  # back to quiet
        b.wait(lambda: rate_matches("#current-disks-write", calm))  # back to quiet
        b.click(".pf-v5-c-popover__close > button")
        # top service should be podman container busybox-write-hog
        m.execute(f"podman run --rm -d --name busybox-write-hog {self.busybox_image} /bin/ash -c 'while true; do dd if=/dev/urandom of=/testfile bs=20M count=100; done'")
        self.addCleanup(m.execute, "podman rm -f busybox-write-hog || true")
        b.wait_text_matches(top_service_cell("Service"), "pod busybox-write-hog")
        b.wait(lambda: rate_matches(top_service_cell("Write"), busy))
        b.wait(lambda: rate_matches(top_service_cell("Read"), calm))  # this should stay calm
        m.execute('podman stop busybox-write-hog')

        # Disk usage

        # add 50 MB loopback disk; mount it once rw and once ro
        m.execute("""set -e
                  F=$(mktemp /var/tmp/loop.XXXX)
                  dd if=/dev/zero of=$F bs=1M count=50
                  mkfs -t ext3 $F
                  mkdir -p /var/cockpittest /var/cockpit-ro-test
                  mount -o loop $F /var/cockpittest
                  RODEV=$(losetup -f --show $F)
                  mount -r $RODEV /var/cockpit-ro-test
                  losetup -d $RODEV
                  rm $F
                  """)
        self.addCleanup(m.execute, "umount /var/cockpittest /var/cockpit-ro-test")

        self.assertLess(progressValue(self, ".pf-v5-c-progress[data-disk-usage-target='/var/cockpittest']"), 5)
        progress_sel = ".pf-v5-c-progress[data-disk-usage-target='/var/cockpittest'] .pf-v5-c-progress__status"
        # free size is anything between 40 and 50 MB
        self.assertRegex(b.text(progress_sel), r"^4\d\.\d MB free$")
        # total size is shown in tooltip
        b.mouse(progress_sel, "mouseenter")
        b.wait_in_text(".pf-v5-c-tooltip", "total")
        # total size is anything between 40 and 50 MB
        self.assertRegex(b.text(".pf-v5-c-tooltip"), r"^4\d\.\d MB total$")
        b.mouse(progress_sel, "mouseleave")
        # read-only loop devices are not shown
        self.assertFalse(b.is_present(".pf-v5-c-progress[data-disk-usage-target='/var/cockpit-ro-test']"))

        m.execute("dd if=/dev/zero of=/var/cockpittest/blob bs=1M count=40")
        b.wait(lambda: progressValue(self, ".pf-v5-c-progress[data-disk-usage-target='/var/cockpittest']") >= 90)

        # clicking on progress leads to the storage page
        if not m.ostree_image:
            self.assertTrue(b.is_present("#current-disks-usage button"))
            b.click(progress_sel)
            b.enter_page("/storage")
            # weird -- storage page does not show transient mount points, only permanent ones; so check for the device
            dev = m.execute("findmnt --noheadings -o SOURCE /var/cockpittest").strip()
            b.wait_in_text("#mounts", dev)

            b.go("/metrics")
            b.enter_page("/metrics")
            b.wait_visible(progress_sel)
            b.logout()

            # without cockpit-storaged, mounts are not links
            self.restore_file("/usr/share/cockpit/storaged/manifest.json")
            m.write("/usr/share/cockpit/storaged/manifest.json", "")
            self.allow_journal_messages("storaged: couldn't read manifest.json: JSON data was empty")
            login(self)
        b.wait_visible(progress_sel)
        self.assertFalse(b.is_present("#current-disks-usage button"))

    @skipOstree("no netcat on CoreOS")
    def testNetwork(self):
        """Check the per-interface rates of the network usage table.

        Loopback has to report MB/s or GB/s in both directions while netcat pipes
        /dev/zero through it, and a freshly created veth has to report 0.
        """
        browser = self.browser
        machine = self.machine

        below_mb = r'^(0|[0-9.]+ (kB|B)/s)$'
        mb_or_more = r'^[0-9.]+ (MB|GB)/s$'

        def row(iface):
            return f"[aria-label='Network usage'] [data-interface='{iface}']"

        def cell(iface, direction):
            return f"{row(iface)} td[data-label='{direction}']"

        def lo_rate_is(direction, pattern):
            shown = browser.text(cell("lo", direction))
            return bool(re.match(pattern, shown))

        # a synthetic veth is guaranteed to stay quiet
        machine.execute("ip link add name cockpittest1 type veth peer name vcockpittest1")
        self.addCleanup(machine.execute, "ip link del dev cockpittest1")

        # the new interface and loopback both get a row
        for iface in ("cockpittest1", "lo"):
            browser.wait_in_text(row(iface), iface)

        # without load, loopback does not transmit MB/s
        browser.wait(lambda: lo_rate_is("In", below_mb))
        browser.wait(lambda: lo_rate_is("Out", below_mb))

        # push lots of data through lo
        machine.execute("systemd-run --collect --slice cockpittest --unit lo-hog sh -ec "
                        " 'nc -n -vv -l 2000 > /dev/null & sleep 1; nc -vv localhost 2000 </dev/zero'")
        browser.wait(lambda: lo_rate_is("In", mb_or_more))
        browser.wait(lambda: lo_rate_is("Out", mb_or_more))
        machine.execute("systemctl stop lo-hog")

        # the veth saw no traffic in either direction
        browser.wait_text(cell("cockpittest1", "In"), "0")
        browser.wait_text(cell("cockpittest1", "Out"), "0")


@skipImage("TODO: Arch Linux packagekit support", "arch")
@skipDistroPackage()
class TestMetricsPackages(packagelib.PackageCase):
    # On-demand installation of cockpit-pcp, pcp and redis from the Metrics page, exercised with
    # fake packages that only ship dummy systemd units and a stub bridge.

    def _reconnectAsAdmin(self):
        # Follow the "Log out" → "Reconnect" path, log in again as admin and return to /metrics.
        b = self.browser
        b.click("button:contains('Log out')")
        b.leave_page()
        b.click("button:contains('Reconnect')")
        b.set_val("#login-user-input", "admin")
        b.set_val("#login-password-input", "foobar")
        b.click('#login-button')
        b.enter_page("/metrics")

    def _switchOnAndInstall(self, service):
        # Open the Metrics settings dialog, turn on the (so far unchecked) switch for `service`
        # ("pmlogger" or "pmproxy"), apply, and confirm the package install dialog that follows.
        b = self.browser
        b.click("#metrics-header-section button.pf-m-secondary")
        b.wait_visible("#pcp-settings-modal")
        b.wait_visible(f"#switch-{service}:not(:checked)")
        b.click(f"#switch-{service}")
        b.wait_visible(f"#switch-{service}:checked")
        applySettings(b)
        # install dialog
        b.click("#dialog button:contains('Install')")
        b.wait_not_present("#dialog")

    def testBasic(self):
        b = self.browser
        m = self.machine

        if m.ostree_image:
            self.login_and_go("/metrics")
            b.wait_in_text(".pf-v5-c-empty-state", "cockpit-pcp is missing")
            b.wait_not_present(".pf-v5-c-empty-state button.pf-m-primary")

            b.click("#metrics-header-section button.pf-m-secondary")
            b.wait_visible("#pcp-settings-modal")
            b.wait_visible("#switch-pmlogger:not(:checked)")
            # no packagekit, can't enable
            b.wait_visible("#switch-pmlogger:disabled")
            b.wait_visible("#switch-pmproxy:disabled")
            return

        # remove the real packages, so that the fake ones below can be installed through the UI
        if m.image.startswith(("debian", "ubuntu")):
            m.execute("dpkg --purge cockpit-pcp-dbgsym || true; dpkg --purge cockpit-pcp pcp redis redis-server")
            # HACK: pcp does not clean up correctly on Debian https://bugs.debian.org/cgi-bin/bugreport.cgi?bug=986074
            m.execute("rm -f /etc/systemd/system/pmlogger.service.requires/pmlogger_farm.service")
        else:
            m.execute("rpm --erase --verbose cockpit-pcp pcp redis")
        if "centos-8" in m.image or "rhel-8" in m.image:
            # RHEL 8 ships this in a module, make sure that doesn't hide our fake package
            m.execute("dnf module disable -y redis || true")

        redis_service = redisService(m.image)
        # every fake unit just sleeps forever, so that it can be enabled and reaches "active"
        dummy_service = "[Service]\nExecStart=/bin/sleep infinity\n[Install]\nWantedBy=multi-user.target\n"

        cpcp_content = {
            "/usr/share/cockpit/pcp/manifest.json": '{"requires": {"cockpit": "135"}, "bridges": [{"match": { "payload": "metrics1"},"spawn": [ "/usr/libexec/cockpit-pcp" ]}]}',
            "/usr/libexec/cockpit-pcp": "true",
        }
        pcp_content = {
            "/lib/systemd/system/pmlogger.service": dummy_service,
            "/lib/systemd/system/pmproxy.service": dummy_service,
        }
        redis_content = {
            f"/lib/systemd/system/{redis_service}.service": dummy_service,
        }

        self.createPackage("cockpit-pcp", "999", "1", content=cpcp_content, depends="pcp",
                           postinst="chmod +x /usr/libexec/cockpit-pcp")
        self.createPackage("pcp", "999", "1", content=pcp_content, postinst="systemctl daemon-reload")
        self.createPackage("redis", "999", "1", content=redis_content, postinst="systemctl daemon-reload")
        self.enableRepo()
        m.execute("pkcon refresh")

        # install c-pcp from the empty state
        self.login_and_go("/metrics")
        b.wait_in_text(".pf-v5-c-empty-state", "cockpit-pcp is missing")
        b.click(".pf-v5-c-empty-state button.pf-m-primary")
        b.click("#dialog button:contains('Install')")
        b.wait_not_present("#dialog")
        self._reconnectAsAdmin()
        b.wait_in_text(".pf-v5-c-empty-state", "Metrics history could not be loaded")
        b.logout()

        # install c-pcp from the Metrics Settings dialog
        m.execute("pkcon remove -y cockpit-pcp pcp")
        self.login_and_go("/metrics")
        self._switchOnAndInstall("pmlogger")
        # sets up pmlogger correctly; this is asynchronous, as it happens in the background after closing install dialog
        m.execute('until [ $(systemctl is-enabled pmlogger) = enabled ]; do sleep 1; done')
        # also needs to wait for activating → active
        m.execute('until [ $(systemctl is-active pmlogger) = active ]; do sleep 1; done')
        # triggers "needs logout"
        self._reconnectAsAdmin()
        # this is just a fake cockpit-pcp package
        b.wait_in_text(".pf-v5-c-empty-state", "Metrics history could not be loaded")
        b.wait_in_text(".pf-v5-c-empty-state", "pmlogger.service is failing to collect data")

        # install redis
        self._switchOnAndInstall("pmproxy")
        # sets up redis correctly; this is asynchronous, as it happens in the background after closing install dialog
        m.execute('until [ $(systemctl is-enabled pmproxy) = enabled ]; do sleep 1; done')
        m.execute('until [ $(systemctl is-active pmproxy) = active ]; do sleep 1; done')
        m.execute(f'until [ $(systemctl is-active {redis_service}) = active ]; do sleep 1; done')
        self.assertIn("redis", m.execute("systemctl show -p Wants --value pmproxy").strip())


@skipDistroPackage()
class TestMultiCPU(MachineCase):
    # CPU usage checks (archived and live) on a test machine provisioned with two CPUs

    provision = {
        "0": {"cpus": 2}
    }

    @skipOstree("no PCP support")
    def testCPUUsage(self):
        b = self.browser
        m = self.machine

        prepareArchive(m, "2corescpu.tar.gz", 1598971635)
        login(self)

        # id of the hour block that contains the archived minutes checked below
        hour = 1598968800000

        # one core is busy, the other idle -- that should be 50% total usage
        usage = getCompressedMinuteValue(self, "cpu", False, hour, 44)
        self.assertGreaterEqual(usage, 0.2)
        self.assertLessEqual(usage, 0.55)

        # next minute, both cores are busy
        spike = getMaximumSpike(self, "cpu", False, hour, 45)
        self.assertGreaterEqual(spike, 0.5)
        self.assertLessEqual(spike, 1.0)

        # The live checks below run with a 60 s wait timeout. wait_timeout() is used as a context
        # manager elsewhere in this file (`with bg.wait_timeout(...)`), so enter it here as well: a
        # bare call merely creates the manager and only has an effect if the implementation happens
        # to apply the timeout eagerly.
        # NOTE(review): Browser.wait_timeout is defined in testlib.py, not visible here -- confirm.
        with b.wait_timeout(60):
            # Test current usage of cores
            b.wait_text("#current-cpu-usage-description", "2 CPUs")
            b.wait(lambda: progressValue(self, "#current-cpu-usage") < 20)
            # two CPU burners, capped by systemd at 60% and 30% of a CPU respectively
            m.execute("systemd-run --collect --slice cockpittest -p CPUQuota=60% --unit cpu-hog dd if=/dev/urandom of=/dev/null")
            m.execute("systemd-run --collect --slice cockpittest -p CPUQuota=30% --unit cpu-piglet dd if=/dev/urandom of=/dev/null")
            # View all CPUs
            b.click("#current-metrics-card-cpu button")
            # each entry reads like "60%"; strip the trailing percent sign before comparing
            b.wait(lambda: int(b.text(".pf-v5-c-popover .cpu-all dd:nth-of-type(1)")[:-1]) > 50)
            b.wait(lambda: int(b.text(".pf-v5-c-popover .cpu-all dd:nth-of-type(2)")[:-1]) > 20)
            b.click(".pf-v5-c-popover button")
            b.wait_not_present(".pf-v5-c-popover")

            # the top CPU core runs cpu-hog
            b.wait(lambda: progressValue(self, "#current-top-cpu-usage") >= 58)
            b.wait(lambda: progressValue(self, "#current-top-cpu-usage") <= 70)
            # looks like "average: 45% max: 60%"
            b.wait(lambda: int(b.text("#current-cpu-usage .pf-v5-c-progress__status").split()[-1].rstrip('%')) >= 58)
            b.wait(lambda: int(b.text("#current-cpu-usage .pf-v5-c-progress__status").split()[-1].rstrip('%')) <= 70)


@skipOstree("no PCP support")
@skipDistroPackage()
@skipMobile()
class TestGrafanaClient(MachineCase):
    # Metrics collected on the test machine have to show up in a Grafana instance that runs on the
    # separate "services" machine.

    provision = {
        "0": {"address": "10.111.112.1/20", "dns": "10.111.112.1", "memory_mb": 512},
        # forward Grafana port, so that a developer can connect to it with local browser
        "services": {"image": "services", "forward": {"3000": 3000}, "memory_mb": 512}
    }

    def testBasic(self):
        client = self.machine
        cockpit = self.browser
        services = self.machines['services']

        def switchOn(service):
            # once the settings dialog is up, flip the (so far unchecked) switch and apply
            cockpit.wait_visible("#pcp-settings-modal")
            cockpit.wait_visible(f"#switch-{service}:not(:checked)")
            cockpit.click(f"#switch-{service}")
            cockpit.wait_visible(f"#switch-{service}:checked")
            applySettings(cockpit)

        # avoid dynamic host name changes during PCP data collection, and start from clean slate
        client.execute("""systemctl stop pmlogger || true
                     systemctl reset-failed pmlogger || true
                     rm -rf /var/log/pcp/pmlogger
                     hostnamectl set-hostname grafana-client""")

        # start Grafana, and wait until it answers from the client machine
        services.execute("/root/run-grafana")
        client.execute("until curl --silent --show-error http://10.111.112.100:3000; do sleep 1; done")
        # enable PCP plugin; like on Cog (Configuration) menu → Plugins → Performance Co-Pilot → Enable
        services.execute("curl --silent --show-error -u admin:foobar -d '' 'http://127.0.0.1:3000/api/plugins/performancecopilot-pcp-app/settings?enabled=true'")
        self.login_and_go("/metrics")

        # pmlogger data collection is not running initially
        for expected in ("Metrics history could not be loaded", "pmlogger.service is not running"):
            cockpit.wait_in_text(".pf-v5-c-empty-state", expected)
        cockpit.click(".pf-v5-c-empty-state button.pf-m-primary")
        switchOn("pmlogger")

        # enable pmproxy+redis (none of our test OSes have both of them running by default)
        cockpit.click("#metrics-header-section button.pf-m-secondary")
        switchOn("pmproxy")

        # enable pmproxy service in firewalld in the alert
        cockpit.wait_visible("#firewalld-request-pmproxy")
        cockpit.click(".pf-v5-c-alert button.pf-m-primary")

        # Log into Grafana (usually http://127.0.0.2:3002 if you do it interactively)
        grafana = Browser(services.forward['3000'], label=self.label() + "-" + services.label, machine=self)
        try:
            grafana.open("/")
            grafana.wait_in_text("body", "Welcome to Grafana")
            for field, value in (("user", "admin"), ("password", "foobar")):
                grafana.set_input_text(f"input[name='{field}']", value)
            grafana.click("button:contains('Log in')")
            grafana.wait_in_text("body", "Add your first data source")

            # HACK Unsigned plugin needs to be enabled manually
            # See https://github.com/performancecopilot/grafana-pcp/issues/94
            plugin_button = ".gf-form-button-row button"
            grafana.open("/plugins/performancecopilot-pcp-app")
            with grafana.wait_timeout(30):
                grafana.wait_visible(plugin_button)
                if grafana.text(plugin_button) == "Enable":
                    grafana.click(plugin_button)
                    grafana.wait_text(plugin_button, "Disable")

            # Add the PCP redis data source for our client machine
            # Cog (Configuration) menu → Data Sources → Add
            # Select PCP redis, HTTP URL http://10.111.112.1:44322
            redis_url = 'http://10.111.112.1:44322'
            pcp_redis_entry = "[aria-label='Add new data source PCP Redis']"
            grafana.open("/datasources/new")
            grafana.wait_visible(pcp_redis_entry)
            grafana.click(pcp_redis_entry)
            grafana.set_input_text("input[placeholder='http://localhost:44322']", redis_url)
            grafana.click("button:contains('Save &')")  # Save & [tT]est
            grafana.wait_in_text("body", "Data source is working")

            def querySeries(request):
                # ask the client's series API from the services machine and hand back the raw reply
                return services.execute(f"curl --max-time 10 --silent --show-error '{redis_url}/series/{request}'")

            # Grafana auto-discovers "host" variable for incoming metrics; it takes a while to receive the first
            # measurement; that event is not observable directly in Grafana, and the dashboard does not auto-update to
            # new variables; so probe the API until it appears
            wait(lambda: "grafana-client" in querySeries("labels?names=hostname"), delay=10, tries=30)
            # ... and the load metrics as well
            wait(lambda: querySeries("query?expr=kernel.all.load").strip() != '[]', delay=10, tries=30)

            # Switch to "Dashboards" tab, import "Host Overview"
            overview_row = "tr:contains('PCP Redis: Host Overview')"
            grafana.click("a[href$='/dashboards'][role=tab]")
            with grafana.wait_timeout(60):
                grafana.wait_not_in_text("body", "Loading")
            grafana.click(f"{overview_row} button:contains('Import')")
            grafana.wait_visible(f"{overview_row} button:contains('Re-import')")

            # .. and the dashboard name becomes clickable
            grafana.click("a:contains('PCP Redis: Host Overview')")

            grafana.wait_in_text(".submenu-controls", "grafana-client")

            # expect a "Load average" panel with a sensible number
            load_max = float(grafana.text("div:contains('Load average') .graph-legend-series:contains('1 minute') .max"))
            self.assertGreater(load_max, 0)
        except Exception:
            # keep a snapshot of the Grafana browser for debugging, then let the failure propagate
            grafana.snapshot("FAIL-grafana")
            raise


# Run the tests through testlib's test_main() when this file is executed as a script
if __name__ == '__main__':
    test_main()