diff --git a/src/backend/InvenTree/InvenTree/apps.py b/src/backend/InvenTree/InvenTree/apps.py
index 53c15f01ef..1cbf4d11c7 100644
--- a/src/backend/InvenTree/InvenTree/apps.py
+++ b/src/backend/InvenTree/InvenTree/apps.py
@@ -188,13 +188,10 @@ class InvenTreeConfig(AppConfig):
     @ignore_ready_warning
     def add_heartbeat(self):
         """Ensure there is at least one background task in the queue."""
-        import django_q.models
-
         try:
-            if django_q.models.OrmQ.objects.count() == 0:
-                InvenTree.tasks.offload_task(
-                    InvenTree.tasks.heartbeat, force_async=True, group='heartbeat'
-                )
+            InvenTree.tasks.offload_task(
+                InvenTree.tasks.heartbeat, force_async=True, group='heartbeat'
+            )
         except AppRegistryNotReady:  # pragma: no cover
             pass
         except Exception:
diff --git a/src/backend/InvenTree/InvenTree/tasks.py b/src/backend/InvenTree/InvenTree/tasks.py
index 729dd007b7..e44520629b 100644
--- a/src/backend/InvenTree/InvenTree/tasks.py
+++ b/src/backend/InvenTree/InvenTree/tasks.py
@@ -438,21 +438,30 @@ def scheduled_task(
 
 
 @tracer.start_as_current_span('heartbeat')
-@scheduled_task(ScheduledTask.MINUTES, 5)
+@scheduled_task(ScheduledTask.MINUTES, 1)
 def heartbeat():
-    """Simple task which runs at 5 minute intervals, so we can determine that the background worker is actually running.
-
-    (There is probably a less "hacky" way of achieving this)?
-    """
+    """Simple task which runs at 1 minute intervals, so we can determine that the background worker is actually running."""
     try:
         from django_q.models import OrmQ, Success
     except AppRegistryNotReady:  # pragma: no cover
         logger.info('Could not perform heartbeat task - App registry not ready')
         return
 
-    threshold = timezone.now() - timedelta(minutes=30)
+    # Write a timestamp file so that health checks can verify worker liveness
+    # without needing to start a full Django process.
+    import tempfile
+    from pathlib import Path
 
-    # Delete heartbeat results more than half an hour old,
+    try:
+        Path(tempfile.gettempdir()).joinpath('inventree_worker_heartbeat').write_text(
+            str(timezone.now().timestamp())
+        )
+    except Exception:
+        pass
+
+    threshold = timezone.now() - timedelta(minutes=15)
+
+    # Delete heartbeat results more than 15 minutes old,
     # otherwise they just create extra noise
     heartbeats = Success.objects.filter(
         func='InvenTree.tasks.heartbeat', started__lte=threshold
diff --git a/tasks.py b/tasks.py
index 8d0624417f..4f94ff57f6 100644
--- a/tasks.py
+++ b/tasks.py
@@ -1603,6 +1603,67 @@ def worker(c, verbose: bool = False):
     manage(c, 'qcluster', pty=True, verbose=verbose)
 
 
+@task(help={'timeout': 'Maximum minutes since last heartbeat (default: 3)'})
+def worker_health(c, timeout: int = 3):
+    """Check if the background worker is healthy by reading the heartbeat file.
+
+    Exits 0 if the worker has run within the last TIMEOUT minutes, 1 otherwise.
+    No Django startup or database access is required.
+    """
+    heartbeat_file = Path(tempfile.gettempdir()) / 'inventree_worker_heartbeat'
+
+    if heartbeat_file.exists():
+        try:
+            age_seconds = time.time() - float(heartbeat_file.read_text().strip())
+            if age_seconds < timeout * 60:
+                success(
+                    f'Worker is healthy (last heartbeat {int(age_seconds) // 60}m {int(age_seconds) % 60}s ago)'
+                )
+                return
+            warning(
+                f'Heartbeat file is stale ({int(age_seconds) // 60}m {int(age_seconds) % 60}s old)'
+            )
+        except Exception as e:
+            warning(f'Could not read heartbeat file: {e}')
+    else:
+        warning(f'Heartbeat file not found: {heartbeat_file}')
+
+    error('Worker health check failed')
+    raise Exit(code=1)
+
+
+@task(
+    help={
+        'address': 'Server address to check (default: http://localhost:8000)',
+        'timeout': 'Request timeout in seconds (default: 5)',
+    }
+)
+def server_health(c, address: str = 'http://localhost:8000', timeout: int = 5):
+    """Check if the web server is healthy by requesting /api/system/health/.
+
+    Exits 0 on HTTP 200, 1 otherwise.
+    No Django startup required.
+    """
+    import urllib.error
+    import urllib.request
+
+    url = f'{address.rstrip("/")}/api/system/health/'
+
+    try:
+        with urllib.request.urlopen(url, timeout=timeout) as response:
+            if response.status == 200:
+                success(f'Server is healthy ({url})')
+                return
+            warning(f'Unexpected status {response.status} from {url}')
+    except urllib.error.URLError as e:
+        warning(f'Could not reach server at {url}: {e.reason}')
+    except Exception as e:
+        warning(f'Unexpected error checking {url}: {e}')
+
+    error('Server health check failed')
+    raise Exit(code=1)
+
+
 @task(post=[static, server])
 def test_translations(c):
     """Add a fictional language to test if each component is ready for translations."""
@@ -2457,6 +2518,8 @@ ns = Collection(
     version,
     wait,
     worker,
+    worker_health,
+    server_health,
     monitor,
     build_docs,
 )