git.sr.ht/gitsrht-periodic

#!/usr/bin/env python3
import os
import sys
import math
import random
import sqlalchemy as sa
import subprocess
import gitsrht.repos as gr
import prometheus_client
from prometheus_client import CollectorRegistry, Gauge
from prometheus_client.context_managers import Timer
from srht.config import cfg
from srht.database import DbSession
from gitsrht.types import Artifact, User, Repository
from minio import Minio
from datetime import datetime, timedelta

db = DbSession(cfg("git.sr.ht", "connection-string"))
db.init()

registry = CollectorRegistry()
tg = Gauge("gitsrht_periodic_time",
        "Time to run gitsrht-periodic jobs",
        ["section"],
        registry=registry)
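
# Each job records its wall-clock runtime under a "section" label on the
# shared gauge: Gauge.labels(...).time() works as a decorator that sets the
# labelled gauge to the decorated function's duration in seconds.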
gc_git_t = tg.labels("gc_git")
@gc_git_t.time()
def gc_git():
    repo_count = Repository.query.count()
    # *srht-periodic scripts are run every twenty minutes, which gives us
    # 504 runs over the course of a week; hence, if we GC a 504th of the
    # repository count each time, on average we will have GCd every repo
    # around once a week.
    limit = int(math.ceil(repo_count / (7 * 24 * 60 / 20)))
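    # For example, with 100,000 repositories, limit = ceil(100000 / 504) = 199
    # repos garbage-collected per 20-minute run.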

    rc = Gauge("gitsrht_periodic_gc_git_count",
            "Amount of repos GCd by the gc_git job",
            registry=registry)
    ps = Gauge("gitsrht_periodic_gc_git_packsize",
            "Packfile size in the gc_git job (B)",
            ["stage"],
            registry=registry)

    repos = (Repository.query
            .offset(random.randrange(0, repo_count + 1 - limit))
            .limit(limit)).all()
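    # Each run GCs a random contiguous window of `limit` repositories; the
    # randrange upper bound keeps offset + limit <= repo_count, so the window
    # never runs past the end of the result set.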
    for r in repos:
        try:
            # Record the total packfile size before and after the GC pass.
            ps.labels("pre").inc(sum(map(lambda p: p.stat().st_size,
                os.scandir(os.path.join(r.path, "objects", "pack")))))
            subprocess.run(["git", "-C", r.path, "gc", "--quiet"],
                    stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
            ps.labels("post").inc(sum(map(lambda p: p.stat().st_size,
                os.scandir(os.path.join(r.path, "objects", "pack")))))
            rc.inc()
        except FileNotFoundError:
            # The repository may have been deleted from disk since the
            # database query; skip it.
            continue

gc_s3_t = tg.labels("gc_s3")
@gc_s3_t.time()
def gc_s3():
    if not gr.object_storage_enabled:
        return

    # Run roughly once a week: a 1-in-504 chance on each 20-minute run.
    if random.randrange(0, int(math.ceil(7 * 24 * 60 / 20))) != 0:
        return

    prefix = os.path.join(gr.s3_prefix, "artifacts")
    minio = Minio(gr.s3_upstream, access_key=gr.s3_access_key,
            secret_key=gr.s3_secret_key, secure=True)
    objs = set(obj.object_name for obj
            in minio.list_objects(gr.s3_bucket, prefix, recursive=True))
    artifacts = Artifact.query.all()
    users = {u.id: u for u in User.query.filter(User.id.in_(
        set(a.user_id for a in artifacts))).all()}
    repos = {r.id: r for r in Repository.query.filter(Repository.id.in_(
        set(a.repo_id for a in artifacts))).all()}
    for art in artifacts:
        artifact_path = os.path.join(prefix, users[art.user_id].canonical_name,
                repos[art.repo_id].name, art.filename)
        objs.discard(artifact_path)
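    # Mark-and-sweep: every bucket object that corresponds to a known
    # Artifact row has been discarded above, so whatever remains in `objs`
    # is dangling and gets deleted.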
    if not objs:
        return

    errs = list(minio.remove_objects(gr.s3_bucket, objs))
    if errs:
        raise Exception(
            f"While removing dangling artifacts {objs}, got errors: {errs}")

    Gauge("gitsrht_periodic_gc_s3_count",
            "Amount of objects pruned by the gc_s3 job",
            registry=registry).set(len(objs))

all_t = tg.labels("total")
@all_t.time()
def all():
    # Note: shadows the builtin all(), which nothing below needs.
    gc_git()
    gc_s3()
all()

pg_endpoint = cfg("sr.ht", "pushgateway", default=None)
if pg_endpoint:
    prometheus_client.pushadd_to_gateway(pg_endpoint,
            job="git.sr.ht", registry=registry)