mirror of
https://github.com/inventree/InvenTree.git
synced 2026-07-04 06:00:38 +00:00
System Health Checks (#12193)
* Add worker health check invoke task * Increase frequency of heartbeat task * Adjust default threshold for worker health check * Add server_health invoke func
This commit is contained in:
@@ -188,13 +188,10 @@ class InvenTreeConfig(AppConfig):
|
|||||||
@ignore_ready_warning
|
@ignore_ready_warning
|
||||||
def add_heartbeat(self):
|
def add_heartbeat(self):
|
||||||
"""Ensure there is at least one background task in the queue."""
|
"""Ensure there is at least one background task in the queue."""
|
||||||
import django_q.models
|
|
||||||
|
|
||||||
try:
|
try:
|
||||||
if django_q.models.OrmQ.objects.count() == 0:
|
InvenTree.tasks.offload_task(
|
||||||
InvenTree.tasks.offload_task(
|
InvenTree.tasks.heartbeat, force_async=True, group='heartbeat'
|
||||||
InvenTree.tasks.heartbeat, force_async=True, group='heartbeat'
|
)
|
||||||
)
|
|
||||||
except AppRegistryNotReady: # pragma: no cover
|
except AppRegistryNotReady: # pragma: no cover
|
||||||
pass
|
pass
|
||||||
except Exception:
|
except Exception:
|
||||||
|
|||||||
@@ -438,21 +438,30 @@ def scheduled_task(
|
|||||||
|
|
||||||
|
|
||||||
@tracer.start_as_current_span('heartbeat')
|
@tracer.start_as_current_span('heartbeat')
|
||||||
@scheduled_task(ScheduledTask.MINUTES, 5)
|
@scheduled_task(ScheduledTask.MINUTES, 1)
|
||||||
def heartbeat():
|
def heartbeat():
|
||||||
"""Simple task which runs at 5 minute intervals, so we can determine that the background worker is actually running.
|
"""Simple task which runs at 1 minute intervals, so we can determine that the background worker is actually running."""
|
||||||
|
|
||||||
(There is probably a less "hacky" way of achieving this)?
|
|
||||||
"""
|
|
||||||
try:
|
try:
|
||||||
from django_q.models import OrmQ, Success
|
from django_q.models import OrmQ, Success
|
||||||
except AppRegistryNotReady: # pragma: no cover
|
except AppRegistryNotReady: # pragma: no cover
|
||||||
logger.info('Could not perform heartbeat task - App registry not ready')
|
logger.info('Could not perform heartbeat task - App registry not ready')
|
||||||
return
|
return
|
||||||
|
|
||||||
threshold = timezone.now() - timedelta(minutes=30)
|
# Write a timestamp file so that health checks can verify worker liveness
|
||||||
|
# without needing to start a full Django process.
|
||||||
|
import tempfile
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
# Delete heartbeat results more than half an hour old,
|
try:
|
||||||
|
Path(tempfile.gettempdir()).joinpath('inventree_worker_heartbeat').write_text(
|
||||||
|
str(timezone.now().timestamp())
|
||||||
|
)
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
|
||||||
|
threshold = timezone.now() - timedelta(minutes=15)
|
||||||
|
|
||||||
|
# Delete heartbeat results more than 15 minutes old,
|
||||||
# otherwise they just create extra noise
|
# otherwise they just create extra noise
|
||||||
heartbeats = Success.objects.filter(
|
heartbeats = Success.objects.filter(
|
||||||
func='InvenTree.tasks.heartbeat', started__lte=threshold
|
func='InvenTree.tasks.heartbeat', started__lte=threshold
|
||||||
|
|||||||
@@ -1603,6 +1603,67 @@ def worker(c, verbose: bool = False):
|
|||||||
manage(c, 'qcluster', pty=True, verbose=verbose)
|
manage(c, 'qcluster', pty=True, verbose=verbose)
|
||||||
|
|
||||||
|
|
||||||
|
@task(help={'timeout': 'Maximum minutes since last heartbeat (default: 3)'})
def worker_health(c, timeout: int = 3):
    """Report whether the background worker has produced a recent heartbeat.

    The check reads a timestamp (seconds since the epoch) from the file
    ``inventree_worker_heartbeat`` in the system temporary directory and
    compares it against the current time. Neither Django nor the database
    is touched.

    Arguments:
        c: Invoke context (unused by this task).
        timeout: Maximum allowed heartbeat age, in minutes.

    Raises:
        Exit: With code 1 when the heartbeat file is missing, unreadable,
            or older than ``timeout`` minutes. On success the task simply
            returns (exit code 0).
    """
    heartbeat_file = Path(tempfile.gettempdir()).joinpath('inventree_worker_heartbeat')

    if not heartbeat_file.exists():
        warning(f'Heartbeat file not found: {heartbeat_file}')
    else:
        try:
            # Sample the clock before reading, then subtract the recorded timestamp
            now = time.time()
            recorded = float(heartbeat_file.read_text().strip())
            age_seconds = now - recorded

            max_age_seconds = timeout * 60

            if age_seconds < max_age_seconds:
                success(
                    f'Worker is healthy (last heartbeat {int(age_seconds) // 60}m {int(age_seconds) % 60}s ago)'
                )
                return

            warning(
                f'Heartbeat file is stale ({int(age_seconds) // 60}m {int(age_seconds) % 60}s old)'
            )
        except Exception as e:
            # Any read / parse / formatting problem counts as an unhealthy worker
            warning(f'Could not read heartbeat file: {e}')

    # Every path that reaches this point is a failed check
    error('Worker health check failed')
    raise Exit(code=1)
|
||||||
|
|
||||||
|
|
||||||
|
@task(
    help={
        'address': 'Server address to check (default: http://localhost:8000)',
        'timeout': 'Request timeout in seconds (default: 5)',
    }
)
def server_health(c, address: str = 'http://localhost:8000', timeout: int = 5):
    """Check if the web server is healthy by requesting /api/system/health/.

    Exits 0 on HTTP 200, 1 otherwise.
    No Django startup required.

    Arguments:
        c: Invoke context (unused by this task).
        address: Base address of the server; a trailing slash is tolerated.
        timeout: Request timeout, in seconds.

    Raises:
        Exit: With code 1 when the server cannot be reached, answers with
            a status other than 200, or any other error occurs.
    """
    import urllib.error
    import urllib.request

    url = f'{address.rstrip("/")}/api/system/health/'

    try:
        with urllib.request.urlopen(url, timeout=timeout) as response:
            if response.status == 200:
                success(f'Server is healthy ({url})')
                return
            warning(f'Unexpected status {response.status} from {url}')
    except urllib.error.HTTPError as e:
        # urlopen raises HTTPError for 4xx / 5xx responses. The server *was*
        # reached, so report the status code rather than a connection failure.
        # This handler must precede URLError, which is HTTPError's base class.
        warning(f'Unexpected status {e.code} from {url}')
    except urllib.error.URLError as e:
        # Genuine transport-level failure (DNS, refused connection, timeout, ...)
        warning(f'Could not reach server at {url}: {e.reason}')
    except Exception as e:
        warning(f'Unexpected error checking {url}: {e}')

    # Every path that reaches this point is a failed check
    error('Server health check failed')
    raise Exit(code=1)
|
||||||
|
|
||||||
|
|
||||||
@task(post=[static, server])
|
@task(post=[static, server])
|
||||||
def test_translations(c):
|
def test_translations(c):
|
||||||
"""Add a fictional language to test if each component is ready for translations."""
|
"""Add a fictional language to test if each component is ready for translations."""
|
||||||
@@ -2457,6 +2518,8 @@ ns = Collection(
|
|||||||
version,
|
version,
|
||||||
wait,
|
wait,
|
||||||
worker,
|
worker,
|
||||||
|
worker_health,
|
||||||
|
server_health,
|
||||||
monitor,
|
monitor,
|
||||||
build_docs,
|
build_docs,
|
||||||
)
|
)
|
||||||
|
|||||||
Reference in New Issue
Block a user