chrome-ec/util/temp_metrics.conf

397 lines
10 KiB
Plaintext

# Copyright 2012 The Chromium OS Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
description "Temporary, quick-hack metrics collection & thermal daemon"
author "chromium-os-dev@chromium.org"
# This is for quickly adding UMA stats that we may need for
# short-term experiments, when we don't have the time to add
# stuff to metrics_daemon. That's where it should go in the
# long term.
#
# This is also currently doing a userland thermal loop to allow
# for quick experimentation. This thermal loop will eventually
# move to the BIOS once the data from experiments help prove its
# efficacy.
start on started system-services
stop on stopping system-services
respawn
script
TEMP_OFFSET=273 # difference between K (reported by EC) and C (used in UMA)
# Thermal loop fields: 1-based column numbers within one row of all_steps.
# They are handed to get_field, which uses them as awk field numbers.
CPU_MAX_FREQ_FIELD=1
CPU_MIN_FREQ_FIELD=2
GPU_MAX_FREQ_FIELD=3
CPU_DUTY_CYCLE_FIELD=4
PKG_POWER_LIMIT_FIELD=5
# Thermal loop steps, one per row, ordered from no throttling (first row)
# to the most aggressive setting (last row). The first five columns line up
# with the *_FIELD numbers above: CPU max freq, CPU min freq, GPU max freq,
# the CPU duty-cycle value written to MSR 0x19a, and the package power limit
# value written to MSR 0x610.
#
# NOTE: everything between the quotes is string data. The "# ..." text at
# the end of each row is therefore NOT a shell comment; it is part of the
# row and shows up in the "Throttling" log message. The string also starts
# with a newline, so step N lives on line N + 2 of the echoed text (this is
# the offset get_step applies). Do not add or remove lines inside the quotes
# without keeping that arithmetic in mind.
all_steps="
1801000 800000 1150 0 0x180aa00dd8088 # no throttling
1801000 800000 1150 0 0x180aa00dd8080 # cap pkg to 16W
1801000 800000 1150 0 0x180aa00dd8078 # cap pkg to 15W
1801000 800000 1150 0 0x180aa00dd8070 # cap pkg to 14W
1801000 800000 1150 0 0x180aa00dd8068 # cap pkg to 13W
1800000 800000 900 0 0x180aa00dd8068 # disable turbo
1600000 800000 800 0 0x180aa00dd8068 # cap CPU & GPU frequency
1400000 800000 700 0 0x180aa00dd8068 # cap CPU & GPU frequency
1200000 800000 600 0 0x180aa00dd8068 # cap CPU & GPU frequency
1000000 800000 500 0 0x180aa00dd8068 # cap CPU & GPU frequency
800000 800000 400 0 0x180aa00dd8068 # cap CPU & GPU frequency
800000 800000 350 0 0x180aa00dd8068 # cap CPU & GPU frequency
800000 800000 350 0x1c 0x180aa00dd8068 # duty cycle CPU
800000 800000 350 0x18 0x180aa00dd8068 # duty cycle CPU
"
# Index of the last step (zero-based). The echoed table has one empty line
# before the rows and one after them; subtracting those two lines plus one
# more turns the line count into the highest valid step number (13 for the
# 14 rows above).
# NOTE(review): max_steps is not read anywhere in the visible part of this
# file; the TEMP_THRESHOLD_*_MAX_STEP constants bound the step instead.
max_steps=$(($(echo "$all_steps" | wc -l) - 3))
# Print the row of the all_steps table that describes one throttling step.
#   $1 - zero-based step number
# The table text begins with an empty line, so step N is found on line
# N + 2. Prints an empty line when the table has no such row.
get_step() {
  line_no=$(($1 + 2))
  row_text=$(echo "$all_steps" | awk -v want="$line_no" 'NR == want { print }')
  echo "$row_text"
}
# Print one whitespace-separated column of a table row.
#   $1 - 1-based awk field number (one of the *_FIELD constants)
#   $2 - the row text, e.g. as printed by get_step
get_field() {
  column=$1
  record=$2
  # The field number is spliced into the awk program text itself.
  value=$(echo "$record" | awk '{print $'"$column"'}')
  # Left unquoted on purpose: the result is emitted as plain words.
  echo $value
}
# Print the reading of EC temperature sensor 9 in degrees C.
# Every non-digit character of the ectool output is discarded and what is
# left is treated as a Kelvin value.
# NOTE(review): nothing in the visible part of this file calls this helper;
# get_max_skin_temp reads sensor 9 through get_sensor_temp instead.
get_peci_temp() {
  kelvin=$(ectool temps 9 | sed -e 's/[^0-9]//g')
  celsius=$((kelvin - $TEMP_OFFSET))
  echo $celsius
}
# Print the reading of one EC temperature sensor in degrees C.
#   $1 - sensor number as understood by "ectool temps"
# Prints 0 when ectool exits with a failure status; callers treat 0 as
# "no valid reading". On success, all non-digit characters of the output
# are dropped and the remainder is converted from Kelvin.
get_sensor_temp() {
  sensor=$1
  if reading=$(ectool temps $sensor); then
    kelvin=$(echo $reading | sed -e 's/[^0-9]//g')
    echo $((kelvin - $TEMP_OFFSET))
  else
    echo 0
  fi
}
# Print the space-separated sensor numbers that are used as skin
# temperature sources: the USB C-Object sensor and the Charger D-Object
# sensor, in that order.
#
# Known object sensors:
#   USB C-Object:     1 or 13
#   PCH D-Object:     3
#   Hinge C-Object:   5 or 15
#   Charger D-Object: 7
# Only the USB and Charger entries are returned. The USB sensor is probed:
# if "ectool tempsinfo 1" mentions "USB C-Object" it is sensor 1,
# otherwise sensor 13 is assumed.
get_sensor_list() {
  charger_d_object=7
  usb_c_object=13
  if ectool tempsinfo 1 | grep -q "USB C-Object"; then
    usb_c_object=1
  fi
  echo $usb_c_object $charger_d_object
}
# Push calibration coefficients to the four TMP006 sensors through
# "ectool tmp006cal". Each call passes: index, S0, B0, B1, B2.
set_calibration_data() {
  # B0, B1 and B2 are the same for every sensor.
  b_coeffs='-2.94e-5 -5.7e-7 4.63e-9'
  # "<index> <S0>" per sensor. Note that the sensor numbering is different
  # between the ectool tmp006 and temps/tempsinfo commands.
  usb_c='0 2.712e-14'
  pch_d='1 9.301e-14'
  hinge_c='2 -11.000e-14'
  charger_d='3 5.141e-14'
  for sensor_cal in "$usb_c" "$pch_d" "$hinge_c" "$charger_d"; do
    # Add "--" otherwise ectool will barf when trying to parse negative
    # coefficients. The expansions are deliberately unquoted so that each
    # number becomes its own argument.
    ectool tmp006cal -- $sensor_cal $b_coeffs
  done
}
# Results of the most recent get_max_skin_temp call.
#   max_skin_temp       - highest reading among the requested sensors, or 0
#                         if none of them read above 0
#   sensor_temperatures - "<sensor>:<temp>:" pairs joined together, used in
#                         log messages
max_skin_temp=0
sensor_temperatures=

# Read every sensor named in the arguments and refresh the two globals
# above.
#   $* - sensor numbers (word-split, so a space-separated list works)
# Sensor 9 (PECI CPU temperature) is appended to sensor_temperatures for
# logging, but it does not take part in the maximum.
get_max_skin_temp() {
  max_skin_temp=0
  sensor_temperatures=
  for sensor in $*; do
    reading=$(get_sensor_temp $sensor)
    sensor_temperatures="${sensor_temperatures}${sensor}:${reading}:"
    if [ $reading -gt $max_skin_temp ]; then
      max_skin_temp=$reading
    fi
  done
  # Record the PECI CPU temperature also.
  peci_reading=$(get_sensor_temp 9)
  sensor_temperatures="${sensor_temperatures}9:${peci_reading}:"
}
# Apply a CPU frequency window to every cpufreq directory matched by
# /sys/devices/system/cpu/cpu?/cpufreq.
#   $1 - value for scaling_max_freq
#   $2 - value for scaling_min_freq
set_cpu_freq() {
  max_freq=$1
  min_freq=$2
  for policy_dir in /sys/devices/system/cpu/cpu?/cpufreq; do
    # Both limits are first set to 800000 (min, then max) before the
    # requested values are written (max, then min). The write order is
    # significant and must be kept.
    # NOTE(review): presumably this avoids a transient min > max request
    # while moving between windows - confirm against the cpufreq docs.
    for limit in scaling_min_freq scaling_max_freq; do
      echo 800000 > $policy_dir/$limit
    done
    echo $max_freq > $policy_dir/scaling_max_freq
    echo $min_freq > $policy_dir/scaling_min_freq
  done
}
# Set the GPU frequency floor through the i915 debugfs file.
# Also defines the global GPU_MIN_FREQ, which set_gpu_max_freq relies on
# as its lower bound, so this must run before set_gpu_max_freq.
set_gpu_min_freq() {
  GPU_MIN_FREQ=450
  printf '%s\n' "$GPU_MIN_FREQ" > /sys/kernel/debug/dri/0/i915_min_freq
}
# Set the GPU frequency ceiling through the i915 debugfs file.
#   $1 - requested maximum frequency
# A request below GPU_MIN_FREQ is raised to GPU_MIN_FREQ. GPU_MIN_FREQ is
# defined by set_gpu_min_freq, which has to be called first.
set_gpu_max_freq() {
  gpu_max_freq=$1
  if [ $gpu_max_freq -lt $GPU_MIN_FREQ ]; then
    gpu_max_freq=$GPU_MIN_FREQ
  fi
  echo $gpu_max_freq > /sys/kernel/debug/dri/0/i915_max_freq
}
# Write the given value to MSR 0x19a on CPUs 0 through 3.
#   $1 - value to write (the duty-cycle column of the step table)
set_duty_cycle() {
  duty_cycle=$1
  for cpu in 0 1 2 3; do
    iotools wrmsr $cpu 0x19a $duty_cycle
  done
}
# Write the given value to MSR 0x610 on CPU 0.
#   $1 - value to write (the package power limit column of the step table)
set_pkg_power_limit() {
  limit_value=$1
  iotools wrmsr 0 0x610 $limit_value
}
# Send one message to syslog under the "temp_metrics" tag.
#   $* - message words; they are joined with spaces into a single string
log_message() {
  message="$*"
  logger -t temp_metrics "$message"
}
# Skin temperature bands used by thermal_loop. A reading above
# TEMP_THRESHOLD_n falls into band n (the highest matching band wins);
# anything else is band 0. Within a band the step is kept between
# MIN_STEP and MAX_STEP, and the band's watermark (WM) decides whether the
# step moves up or down. Band 0 uses a watermark of 0.
TEMP_THRESHOLD_0_MIN_STEP=0
TEMP_THRESHOLD_0_MAX_STEP=0

TEMP_THRESHOLD_1=38
TEMP_THRESHOLD_1_WM=40
TEMP_THRESHOLD_1_MIN_STEP=1
TEMP_THRESHOLD_1_MAX_STEP=5

TEMP_THRESHOLD_2=45
TEMP_THRESHOLD_2_WM=47
TEMP_THRESHOLD_2_MIN_STEP=6
TEMP_THRESHOLD_2_MAX_STEP=9

TEMP_THRESHOLD_3=50
TEMP_THRESHOLD_3_WM=50
TEMP_THRESHOLD_3_MIN_STEP=10
TEMP_THRESHOLD_3_MAX_STEP=13

# Step state carried from one thermal_loop call to the next. The two start
# out different (1 vs 0), so a first reading in band 0 is seen as a change
# and step 0 gets applied.
current_step=1
new_step=0

# Run one iteration of the throttling loop.
#   $1 - current maximum skin temperature in degrees C
# Moves current_step by at most one step per call (before clamping to the
# band limits) and, only when the step changed, logs it and applies the
# row of the step table through the set_* helpers.
thermal_loop() {
  # Hack to reset turbo activation threshold since BIOS can change it
  # underneath us.
  iotools wrmsr 0 0x64c 0x12

  skin_temp=$1

  # Pick the band, checking the hottest threshold first.
  if [ $skin_temp -gt $TEMP_THRESHOLD_3 ]; then
    band=3
  elif [ $skin_temp -gt $TEMP_THRESHOLD_2 ]; then
    band=2
  elif [ $skin_temp -gt $TEMP_THRESHOLD_1 ]; then
    band=1
  else
    band=0
  fi

  # Load the watermark and step limits of that band.
  case $band in
    3)
      temp_watermark=$TEMP_THRESHOLD_3_WM
      min_step=$TEMP_THRESHOLD_3_MIN_STEP
      max_step=$TEMP_THRESHOLD_3_MAX_STEP
      ;;
    2)
      temp_watermark=$TEMP_THRESHOLD_2_WM
      min_step=$TEMP_THRESHOLD_2_MIN_STEP
      max_step=$TEMP_THRESHOLD_2_MAX_STEP
      ;;
    1)
      temp_watermark=$TEMP_THRESHOLD_1_WM
      min_step=$TEMP_THRESHOLD_1_MIN_STEP
      max_step=$TEMP_THRESHOLD_1_MAX_STEP
      ;;
    *)
      temp_watermark=0
      min_step=$TEMP_THRESHOLD_0_MIN_STEP
      max_step=$TEMP_THRESHOLD_0_MAX_STEP
      ;;
  esac

  # Above the watermark: one step up. Below it: one step down. Exactly on
  # it: hold. When the inner condition is false, new_step intentionally
  # keeps whatever value the previous call left in it.
  if [ $skin_temp -gt $temp_watermark ]; then
    if [ $current_step -ne $max_step ]; then
      new_step=$(($current_step + 1))
    fi
  elif [ $skin_temp -lt $temp_watermark ]; then
    if [ $current_step -gt $min_step ]; then
      new_step=$(($current_step - 1))
    fi
  else
    new_step=$current_step
  fi

  # Pull the result back inside the limits of the current band. This is
  # what makes the step jump when the temperature crosses into a new band.
  if [ $new_step -gt $max_step ]; then
    new_step=$max_step
  elif [ $new_step -lt $min_step ]; then
    new_step=$min_step
  fi

  # Nothing to apply if the step did not move.
  if [ $new_step -eq $current_step ]; then
    return
  fi
  current_step=$new_step

  # Look up the row for the new step, log it, and program the hardware.
  # $step is passed unquoted so the row is logged as individual words.
  step=$(get_step $new_step)
  log_message "Throttling (temps: $sensor_temperatures):" $step
  set_cpu_freq $(get_field $CPU_MAX_FREQ_FIELD "$step") \
    $(get_field $CPU_MIN_FREQ_FIELD "$step")
  set_gpu_max_freq $(get_field $GPU_MAX_FREQ_FIELD "$step")
  set_duty_cycle $(get_field $CPU_DUTY_CYCLE_FIELD "$step")
  set_pkg_power_limit $(get_field $PKG_POWER_LIMIT_FIELD "$step")
}
# Print the fan speed reported by "ectool pwmgetfanrpm", reduced to its
# digits. Prints an empty line if the output contains no digits at all.
get_fan_rpm() {
  rpm_digits=$(ectool pwmgetfanrpm | sed -e 's/[^0-9]//g')
  # Unquoted on purpose, matching plain word output.
  echo $rpm_digits
}
# Ask the EC for a target fan speed.
#   $1 - requested RPM, handed to "ectool pwmsetfanrpm"
set_fan_rpm() {
  requested_rpm=$1
  ectool pwmsetfanrpm $requested_rpm
}
# Put all six fan hysteresis thresholds (temp_low1..temp_low6) back to 105.
# fan_loop compares the skin temperature against these with -gt, and then
# lowers the one threshold that belongs to the speed it just selected.
reset_fan_thresholds() {
  temp_low1=105 temp_low2=105 temp_low3=105
  temp_low4=105 temp_low5=105 temp_low6=105
}
# Fan state carried between fan_loop calls.
#   last_rpm   - RPM most recently requested. It starts at 10, a value no
#                fan level uses, so the first call always issues a request.
#   temp_lowN  - hysteresis threshold of fan level N. 105 means "not
#                armed"; the level that is currently selected has its
#                threshold lowered so the fan keeps that speed until the
#                temperature drops to the lowered value.
last_rpm=10
temp_low1=105
temp_low2=105
temp_low3=105
temp_low4=105
temp_low5=105
temp_low6=105

# Run one iteration of the fan control loop.
#   $1 - current maximum skin temperature in degrees C
# Picks a target RPM from the temperature (with hysteresis), then requests
# it through set_fan_rpm unless the fan is already known to be in the
# wanted state. Updates last_rpm and the temp_lowN thresholds.
fan_loop() {
  skin_temp=$1

  # Each level triggers either on its rising threshold (the literal) or,
  # if it is the level already in use, on its lowered falling threshold.
  if [ $skin_temp -gt 48 ] || [ $skin_temp -gt $temp_low1 ]; then
    rpm=9300
    reset_fan_thresholds
    temp_low1=46
  elif [ $skin_temp -gt 44 ] || [ $skin_temp -gt $temp_low2 ]; then
    rpm=8000
    reset_fan_thresholds
    temp_low2=43
  elif [ $skin_temp -gt 42 ] || [ $skin_temp -gt $temp_low3 ]; then
    rpm=7000
    reset_fan_thresholds
    temp_low3=41
  elif [ $skin_temp -gt 40 ] || [ $skin_temp -gt $temp_low4 ]; then
    rpm=5500
    reset_fan_thresholds
    temp_low4=39
  elif [ $skin_temp -gt 38 ] || [ $skin_temp -gt $temp_low5 ]; then
    rpm=4000
    reset_fan_thresholds
    temp_low5=34
  elif [ $skin_temp -gt 33 ] || [ $skin_temp -gt $temp_low6 ]; then
    rpm=3000
    reset_fan_thresholds
    temp_low6=30
  else
    rpm=0
    reset_fan_thresholds
  fi

  # During S0->S3->S0 transitions, the EC sets the fan RPM to 0. This script
  # isn't aware of such transitions. Read the current fan RPM again to see
  # if it got set to 0. Note that comparing the current fan RPM against last
  # requested RPM won't suffice since the actual fan RPM may not be exactly
  # what was requested.
  cur_rpm=$(get_fan_rpm)
  case $cur_rpm in
    '' | *[!0-9]*)
      # The reading is not a plain number (get_fan_rpm prints an empty line
      # when the ectool output has no digits). The fan state is unknown, so
      # fall through and issue the request again. Testing the value with
      # [ ... -ne 0 ] here would only produce a syntax error on stderr.
      ;;
    *)
      if [ "$cur_rpm" -eq 0 ]; then
        # Fan is stopped: that is only fine if stopped is what we want.
        if [ "$rpm" -eq 0 ]; then
          last_rpm=$rpm
          return
        fi
      elif [ "$last_rpm" -eq "$rpm" ]; then
        # Fan is spinning and the target has not changed.
        last_rpm=$rpm
        return
      fi
      ;;
  esac

  log_message "Setting fan RPM (temps: $sensor_temperatures): $last_rpm -> $rpm"
  last_rpm=$rpm
  set_fan_rpm $rpm
}
# One-time setup performed before the polling loop starts.

# Thermal zone 1 is for operating systems where a userland thermal loop
# doesn't exist. Disable it (only if the kernel exposes the mode file).
zone1_mode=/sys/class/thermal/thermal_zone1/mode
if [ -e "$zone1_mode" ]; then
  # No trailing newline is written.
  printf '%s' 'disabled' > "$zone1_mode"
fi

# Enable the fan in case no other code has enabled it.
ectool fanduty 0

# Get list of sensors to monitor.
sensor_list=$(get_sensor_list)

# Set sensor calibration data.
set_calibration_data

# Set minimum GPU frequency. This also defines GPU_MIN_FREQ, which the
# thermal loop's GPU clamp depends on.
set_gpu_min_freq
# Main polling loop: every 10 seconds read the skin temperature and run the
# fan and thermal loops; every third pass also report all sensor readings
# as metrics.
#   loop_count  - passes since the last metrics report
#   ec_fan_loop - 1 while fan control has been handed back to the EC
loop_count=0
ec_fan_loop=0
while true; do
  sleep 10
  loop_count=$(($loop_count + 1))

  # Read the max skin temperature.
  get_max_skin_temp $sensor_list
  if [ $max_skin_temp -eq 0 ]; then
    # 0 means no sensor produced a usable reading. Hand the fan over to
    # the EC once, and reset last_rpm so that fan_loop issues a fresh
    # request when readings come back.
    if [ $ec_fan_loop -eq 0 ]; then
      log_message "Invalid max skin temp. Switching to EC fan loop."
      ectool autofanctrl
      ec_fan_loop=1
      last_rpm=10
    fi
  else
    # Run the fan loop.
    fan_loop $max_skin_temp
    ec_fan_loop=0
    # Run the thermal loop.
    thermal_loop $max_skin_temp
  fi

  # Report the metrics once every 30 seconds.
  if [ $loop_count -ge 3 ]; then
    loop_count=0
    # Each line is split at the first ":" for the sensor index and at the
    # last space for the Kelvin reading. The loop body runs in a pipeline,
    # so its variables do not survive past "done".
    ectool temps all | while read sensor_line; do
      sensor_index=$(printf "%02d" "${sensor_line%%:*}")
      kelvin="${sensor_line##* }"
      celsius=$(($kelvin - $TEMP_OFFSET))
      # ignore values below freezing
      if [ $celsius -lt 0 ]; then
        celsius=0
      fi
      # Use a linear histogram with 1 C buckets starting at 0.
      N_SLOTS=180
      metrics_client -e Platform.Temperature.Sensor$sensor_index $celsius $N_SLOTS
    done
  fi
done
end script