mirror of
https://github.com/inventree/InvenTree.git
synced 2026-07-04 06:00:38 +00:00
System Health Checks (#12193)
* Add worker health check invoke task * Increase frequency of heartbeat task * Adjust default threshold for worker health check * Add server_health invoke func
This commit is contained in:
@@ -188,13 +188,10 @@ class InvenTreeConfig(AppConfig):
|
||||
@ignore_ready_warning
|
||||
def add_heartbeat(self):
|
||||
"""Ensure there is at least one background task in the queue."""
|
||||
import django_q.models
|
||||
|
||||
try:
|
||||
if django_q.models.OrmQ.objects.count() == 0:
|
||||
InvenTree.tasks.offload_task(
|
||||
InvenTree.tasks.heartbeat, force_async=True, group='heartbeat'
|
||||
)
|
||||
InvenTree.tasks.offload_task(
|
||||
InvenTree.tasks.heartbeat, force_async=True, group='heartbeat'
|
||||
)
|
||||
except AppRegistryNotReady: # pragma: no cover
|
||||
pass
|
||||
except Exception:
|
||||
|
||||
@@ -438,21 +438,30 @@ def scheduled_task(
|
||||
|
||||
|
||||
@tracer.start_as_current_span('heartbeat')
|
||||
@scheduled_task(ScheduledTask.MINUTES, 5)
|
||||
@scheduled_task(ScheduledTask.MINUTES, 1)
|
||||
def heartbeat():
|
||||
"""Simple task which runs at 5 minute intervals, so we can determine that the background worker is actually running.
|
||||
|
||||
(There is probably a less "hacky" way of achieving this)?
|
||||
"""
|
||||
"""Simple task which runs at 1 minute intervals, so we can determine that the background worker is actually running."""
|
||||
try:
|
||||
from django_q.models import OrmQ, Success
|
||||
except AppRegistryNotReady: # pragma: no cover
|
||||
logger.info('Could not perform heartbeat task - App registry not ready')
|
||||
return
|
||||
|
||||
threshold = timezone.now() - timedelta(minutes=30)
|
||||
# Write a timestamp file so that health checks can verify worker liveness
|
||||
# without needing to start a full Django process.
|
||||
import tempfile
|
||||
from pathlib import Path
|
||||
|
||||
# Delete heartbeat results more than half an hour old,
|
||||
try:
|
||||
Path(tempfile.gettempdir()).joinpath('inventree_worker_heartbeat').write_text(
|
||||
str(timezone.now().timestamp())
|
||||
)
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
threshold = timezone.now() - timedelta(minutes=15)
|
||||
|
||||
# Delete heartbeat results more than 15 minutes old,
|
||||
# otherwise they just create extra noise
|
||||
heartbeats = Success.objects.filter(
|
||||
func='InvenTree.tasks.heartbeat', started__lte=threshold
|
||||
|
||||
@@ -1603,6 +1603,67 @@ def worker(c, verbose: bool = False):
|
||||
manage(c, 'qcluster', pty=True, verbose=verbose)
|
||||
|
||||
|
||||
@task(help={'timeout': 'Maximum minutes since last heartbeat (default: 3)'})
def worker_health(c, timeout: int = 3):
    """Check if the background worker is healthy by reading the heartbeat file.

    The heartbeat file lives in the system temporary directory and is expected
    to contain a single Unix timestamp (seconds, as text).

    Exits 0 if the worker has run within the last TIMEOUT minutes, 1 otherwise.
    No Django startup or database access is required.

    Arguments:
        c: Invoke context (not used by this check).
        timeout: Maximum permitted age of the last heartbeat, in minutes.

    Raises:
        Exit: With code 1 if the heartbeat file is missing, unreadable,
            malformed or older than the permitted timeout.
    """
    heartbeat_file = Path(tempfile.gettempdir()) / 'inventree_worker_heartbeat'

    try:
        # Read the file directly instead of testing exists() first: the file
        # may be replaced or removed between the check and the read.
        age_seconds = time.time() - float(heartbeat_file.read_text().strip())

        # A timestamp in the future (e.g. after a clock adjustment) would
        # otherwise be displayed as a negative age - clamp for display only.
        # int() also rejects NaN / infinite values here, inside the handler.
        elapsed = max(0, int(age_seconds))
    except FileNotFoundError:
        warning(f'Heartbeat file not found: {heartbeat_file}')
    except (OSError, ValueError, OverflowError) as e:
        # OSError: file could not be read; ValueError / OverflowError: the
        # content is not a usable timestamp
        warning(f'Could not read heartbeat file: {e}')
    else:
        minutes, seconds = divmod(elapsed, 60)

        if age_seconds < timeout * 60:
            success(f'Worker is healthy (last heartbeat {minutes}m {seconds}s ago)')
            return

        warning(f'Heartbeat file is stale ({minutes}m {seconds}s old)')

    # Every path which reaches this point is a failed health check
    error('Worker health check failed')
    raise Exit(code=1)
|
||||
|
||||
|
||||
@task(
    help={
        'address': 'Server address to check (default: http://localhost:8000)',
        'timeout': 'Request timeout in seconds (default: 5)',
    }
)
def server_health(c, address: str = 'http://localhost:8000', timeout: int = 5):
    """Check if the web server is healthy by requesting /api/system/health/.

    Exits 0 on HTTP 200, 1 otherwise.
    No Django startup required.

    Arguments:
        c: Invoke context (not used by this check).
        address: Base address of the server (a trailing slash is tolerated).
        timeout: Request timeout, in seconds.

    Raises:
        Exit: With code 1 if the server cannot be reached, or responds with
            any status other than 200.
    """
    import urllib.error
    import urllib.request

    url = f'{address.rstrip("/")}/api/system/health/'

    try:
        with urllib.request.urlopen(url, timeout=timeout) as response:
            if response.status == 200:
                success(f'Server is healthy ({url})')
                return

            # Non-error status codes other than 200 arrive here
            warning(f'Unexpected status {response.status} from {url}')
    except urllib.error.HTTPError as e:
        # urlopen raises HTTPError for 4xx / 5xx responses. The server *was*
        # reached, so report the status code rather than a connection failure.
        # This clause must precede URLError, as HTTPError is a subclass of it.
        warning(f'Unexpected status {e.code} from {url}')
    except urllib.error.URLError as e:
        warning(f'Could not reach server at {url}: {e.reason}')
    except Exception as e:
        # Deliberate catch-all: a health check must always finish with a clean
        # exit code (e.g. for a malformed address) rather than a traceback.
        warning(f'Unexpected error checking {url}: {e}')

    error('Server health check failed')
    raise Exit(code=1)
|
||||
|
||||
|
||||
@task(post=[static, server])
|
||||
def test_translations(c):
|
||||
"""Add a fictional language to test if each component is ready for translations."""
|
||||
@@ -2457,6 +2518,8 @@ ns = Collection(
|
||||
version,
|
||||
wait,
|
||||
worker,
|
||||
worker_health,
|
||||
server_health,
|
||||
monitor,
|
||||
build_docs,
|
||||
)
|
||||
|
||||
Reference in New Issue
Block a user