2
0
mirror of https://github.com/inventree/InvenTree.git synced 2026-07-04 06:00:38 +00:00

System Health Checks (#12193)

* Add worker health check invoke task

* Increase frequency of heartbeat task

* Adjust default threshold for worker health check

* Add server_health invoke func
This commit is contained in:
Oliver
2026-06-18 12:46:46 +10:00
committed by GitHub
parent c126e2b0af
commit 4b29032c6e
3 changed files with 82 additions and 13 deletions
+3 -6
View File
@@ -188,13 +188,10 @@ class InvenTreeConfig(AppConfig):
@ignore_ready_warning
def add_heartbeat(self):
"""Ensure there is at least one background task in the queue."""
import django_q.models
try:
if django_q.models.OrmQ.objects.count() == 0:
InvenTree.tasks.offload_task(
InvenTree.tasks.heartbeat, force_async=True, group='heartbeat'
)
InvenTree.tasks.offload_task(
InvenTree.tasks.heartbeat, force_async=True, group='heartbeat'
)
except AppRegistryNotReady: # pragma: no cover
pass
except Exception:
+16 -7
View File
@@ -438,21 +438,30 @@ def scheduled_task(
@tracer.start_as_current_span('heartbeat')
@scheduled_task(ScheduledTask.MINUTES, 5)
@scheduled_task(ScheduledTask.MINUTES, 1)
def heartbeat():
"""Simple task which runs at 5 minute intervals, so we can determine that the background worker is actually running.
(There is probably a less "hacky" way of achieving this)?
"""
"""Simple task which runs at 1 minute intervals, so we can determine that the background worker is actually running."""
try:
from django_q.models import OrmQ, Success
except AppRegistryNotReady: # pragma: no cover
logger.info('Could not perform heartbeat task - App registry not ready')
return
threshold = timezone.now() - timedelta(minutes=30)
# Write a timestamp file so that health checks can verify worker liveness
# without needing to start a full Django process.
import tempfile
from pathlib import Path
# Delete heartbeat results more than half an hour old,
try:
Path(tempfile.gettempdir()).joinpath('inventree_worker_heartbeat').write_text(
str(timezone.now().timestamp())
)
except Exception:
pass
threshold = timezone.now() - timedelta(minutes=15)
# Delete heartbeat results more than 15 minutes old,
# otherwise they just create extra noise
heartbeats = Success.objects.filter(
func='InvenTree.tasks.heartbeat', started__lte=threshold
+63
View File
@@ -1603,6 +1603,67 @@ def worker(c, verbose: bool = False):
manage(c, 'qcluster', pty=True, verbose=verbose)
@task(help={'timeout': 'Maximum minutes since last heartbeat (default: 3)'})
def worker_health(c, timeout: int = 3):
"""Check if the background worker is healthy by reading the heartbeat file.
Exits 0 if the worker has run within the last TIMEOUT minutes, 1 otherwise.
No Django startup or database access is required.
"""
heartbeat_file = Path(tempfile.gettempdir()) / 'inventree_worker_heartbeat'
if heartbeat_file.exists():
try:
age_seconds = time.time() - float(heartbeat_file.read_text().strip())
if age_seconds < timeout * 60:
success(
f'Worker is healthy (last heartbeat {int(age_seconds) // 60}m {int(age_seconds) % 60}s ago)'
)
return
warning(
f'Heartbeat file is stale ({int(age_seconds) // 60}m {int(age_seconds) % 60}s old)'
)
except Exception as e:
warning(f'Could not read heartbeat file: {e}')
else:
warning(f'Heartbeat file not found: {heartbeat_file}')
error('Worker health check failed')
raise Exit(code=1)
@task(
help={
'address': 'Server address to check (default: http://localhost:8000)',
'timeout': 'Request timeout in seconds (default: 5)',
}
)
def server_health(c, address: str = 'http://localhost:8000', timeout: int = 5):
"""Check if the web server is healthy by requesting /api/system/health/.
Exits 0 on HTTP 200, 1 otherwise.
No Django startup required.
"""
import urllib.error
import urllib.request
url = f'{address.rstrip("/")}/api/system/health/'
try:
with urllib.request.urlopen(url, timeout=timeout) as response:
if response.status == 200:
success(f'Server is healthy ({url})')
return
warning(f'Unexpected status {response.status} from {url}')
except urllib.error.URLError as e:
warning(f'Could not reach server at {url}: {e.reason}')
except Exception as e:
warning(f'Unexpected error checking {url}: {e}')
error('Server health check failed')
raise Exit(code=1)
@task(post=[static, server])
def test_translations(c):
"""Add a fictional language to test if each component is ready for translations."""
@@ -2457,6 +2518,8 @@ ns = Collection(
version,
wait,
worker,
worker_health,
server_health,
monitor,
build_docs,
)