diff --git a/.github/workflows/chatbot.yaml b/.github/workflows/chatbot.yaml index 424b4fe3..4fec87e0 100644 --- a/.github/workflows/chatbot.yaml +++ b/.github/workflows/chatbot.yaml @@ -48,53 +48,55 @@ jobs: run: | pip install --no-cache-dir opentelemetry-sdk opentelemetry-exporter-otlp opentelemetry-instrumentation + - name: Download OpenTelemetry Collector Contrib + run: | + wget https://github.com/open-telemetry/opentelemetry-collector-releases/releases/download/v0.103.0/otelcol-contrib_0.103.0_linux_amd64.tar.gz + tar -xvf otelcol-contrib_0.103.0_linux_amd64.tar.gz + + - name: Write secrets to files + run: | + echo "${{ secrets.ROSA_OTEL_CACERT }}" > /tmp/ca.crt + echo "${{ secrets.ROSA_OTEL_SERVER_CRT }}" > /tmp/server.crt + echo "${{ secrets.ROSA_OTEL_SERVER_KEY }}" > /tmp/server.key + - name: Configure OpenTelemetry Collector run: | echo ' - extensions: - basicauth/client: - client_auth: - username: "${{ secrets.OTEL_USERNAME }}" - password: "${{ secrets.OTEL_PASSWORD }}" - receivers: - otlp: - protocols: - grpc: - http: - exporters: - otlphttp: - endpoint: https://otc.apps.platform-sts.pcbk.p1.openshiftapps.com - auth: - authenticator: basicauth/client - tls: - insecure: false - ca_pem: "${{ secrets.ROSA_ROOT_CERT }}" - debug: - verbosity: detailed - service: - extensions: [basicauth/client] - pipelines: - traces: - receivers: [otlp] - exporters: [debug, otlphttp] + receivers: + otlp: + protocols: + grpc: + http: + exporters: + otlphttp: + endpoint: "${{ secrets.ROSA_OTEL_ENDPOINT }}" + tls: + insecure: false + cert_file: /tmp/server.crt + key_file: /tmp/server.key + ca_file: /tmp/ca.crt + debug: + verbosity: detailed + service: + pipelines: + traces: + receivers: [otlp] + exporters: [debug, otlphttp] ' > otel-collector-config.yaml - name: Run OpenTelemetry Collector run: | - wget https://github.com/open-telemetry/opentelemetry-collector-releases/releases/download/v0.103.0/otelcol-contrib_0.103.0_linux_amd64.tar.gz - tar -xvf otelcol-contrib_0.103.0_linux_amd64.tar.gz - chmod +x otelcol-contrib - ./otelcol-contrib --config otel-collector-config.yaml & + ./otelcol-contrib --config otel-collector-config.yaml > otel-collector.log 2>&1 & - name: Install qemu dependency run: | sudo apt-get update sudo apt-get install -y qemu-user-static - - name: Start build trace + - name: Start job trace run: | - export WORKFLOW_NAME="chatbot-build-push" - export STEP_NAME="build-image" + export WORKFLOW_NAME="chatbot" + export JOB_NAME="chatbot-build-and-push" export TRACE_ACTION="start" python ci/trace-steps.py @@ -108,54 +110,19 @@ jobs: containerfiles: ./recipes/natural_language_processing/${{ env.IMAGE_NAME }}/app/Containerfile context: recipes/natural_language_processing/${{ env.IMAGE_NAME }}/app - - name: End build trace - run: | - export WORKFLOW_NAME="chatbot-build-push" - export STEP_NAME="build-image" - export TRACE_ACTION="end" - python ci/trace-steps.py - - name: Install Dependencies working-directory: ./recipes/natural_language_processing/${{ env.IMAGE_NAME }} run: make install - - name: Start download model trace - run: | - export WORKFLOW_NAME="chatbot-build-push" - export STEP_NAME="download-model" - export TRACE_ACTION="start" - python ci/trace-steps.py - - name: Download model working-directory: ./models run: make download-model-granite - - name: End download model trace - run: | - export WORKFLOW_NAME="chatbot-build-push" - export STEP_NAME="download-model" - export TRACE_ACTION="end" - python ci/trace-steps.py - - - name: Start functional test run trace - run: | - export WORKFLOW_NAME="chatbot-build-push" - export STEP_NAME="run-functional-tests" - export TRACE_ACTION="start" - python ci/trace-steps.py - - name: Run Functional Tests shell: bash run: make functional-tests working-directory: ./recipes/natural_language_processing/${{ env.IMAGE_NAME }} - - name: End functional test run trace - run: | - export WORKFLOW_NAME="chatbot-build-push" - export STEP_NAME="run-functional-tests" - export TRACE_ACTION="end" - python ci/trace-steps.py - - name: Login to Registry if: github.event_name == 'push' && github.ref == 'refs/heads/main' uses: redhat-actions/podman-login@v1.7 @@ -164,13 +131,6 @@ jobs: username: ${{ github.actor }} password: ${{ secrets.GITHUB_TOKEN }} - - name: Start push image trace - run: | - export WORKFLOW_NAME="chatbot-build-push" - export STEP_NAME="push-image" - export TRACE_ACTION="start" - python ci/trace-steps.py - - name: Push Image id: push_image if: github.event_name == 'push' && github.ref == 'refs/heads/main' @@ -180,9 +140,10 @@ jobs: tags: ${{ steps.build_image.outputs.tags }} registry: ${{ env.REGISTRY }} - - name: End push image trace + - name: End job trace run: | - export WORKFLOW_NAME="chatbot-build-push" - export STEP_NAME="push-image" + export WORKFLOW_NAME="chatbot" + export JOB_NAME="chatbot-build-and-push" export TRACE_ACTION="end" python ci/trace-steps.py + diff --git a/.github/workflows/test-trace-steps.yaml b/.github/workflows/test-trace-steps.yaml index 31ad6423..673fe1d0 100644 --- a/.github/workflows/test-trace-steps.yaml +++ b/.github/workflows/test-trace-steps.yaml @@ -12,9 +12,9 @@ on: workflow_dispatch: jobs: - test: + test-build: if: "!contains(github.event.pull_request.labels.*.name, 'hold-tests')" - runs-on: ubuntu-22.04 + runs-on: ubuntu-24.04 steps: - uses: actions/checkout@v4.1.7 - name: Set up Python @@ -33,16 +33,13 @@ jobs: - name: Write secrets to files run: | - echo "${{ secrets.ROSA_OTEL_TLS_CERT }}" > /tmp/tls.crt + echo "${{ secrets.ROSA_OTEL_CACERT }}" > /tmp/ca.crt + echo "${{ secrets.ROSA_OTEL_SERVER_CRT }}" > /tmp/server.crt + echo "${{ secrets.ROSA_OTEL_SERVER_KEY }}" > /tmp/server.key - name: Configure OpenTelemetry Collector run: | echo ' - extensions: - basicauth/client: - client_auth: - username: "${{ secrets.OTEL_USERNAME }}" - password: "${{ secrets.OTEL_PASSWORD }}" receivers: otlp: protocols: @@ -50,16 +47,15 @@ jobs: http: exporters: otlphttp: - endpoint: https://otc.apps.platform-sts.pcbk.p1.openshiftapps.com:4318 - auth: - authenticator: basicauth/client + endpoint: "${{ secrets.ROSA_OTEL_ENDPOINT }}" tls: insecure: false - ca_file: /tmp/tls.crt + cert_file: /tmp/server.crt + key_file: /tmp/server.key + ca_file: /tmp/ca.crt debug: verbosity: detailed service: - extensions: [basicauth/client] pipelines: traces: receivers: [otlp] @@ -70,10 +66,10 @@ jobs: run: | ./otelcol-contrib --config otel-collector-config.yaml > otel-collector.log 2>&1 & - - name: Start build trace + - name: Start job trace run: | - export WORKFLOW_NAME="test-workflow" - export STEP_NAME="build" + export WORKFLOW_NAME="test-trace" + export JOB_NAME="test-build" export TRACE_ACTION="start" python ci/trace-steps.py @@ -82,29 +78,15 @@ jobs: echo "Simulating build step..." sleep 2 - - name: End build trace - run: | - export WORKFLOW_NAME="test-workflow" - export STEP_NAME="build" - export TRACE_ACTION="end" - python ci/trace-steps.py - - - name: Start test trace - run: | - export WORKFLOW_NAME="test-workflow" - export STEP_NAME="test" - export TRACE_ACTION="start" - python ci/trace-steps.py - - name: Test run: | echo "Simulating test step..." sleep 2 - - name: End test trace + - name: End job trace run: | - export WORKFLOW_NAME="test-workflow" - export STEP_NAME="test" + export WORKFLOW_NAME="test-trace" + export JOB_NAME="test-build" export TRACE_ACTION="end" python ci/trace-steps.py diff --git a/ci/trace-steps.py b/ci/trace-steps.py index a7c77e99..07db0606 100644 --- a/ci/trace-steps.py +++ b/ci/trace-steps.py @@ -1,75 +1,47 @@ import os import time -import logging +from datetime import datetime from opentelemetry import trace -from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter from opentelemetry.sdk.resources import Resource from opentelemetry.sdk.trace import TracerProvider from opentelemetry.sdk.trace.export import BatchSpanProcessor, ConsoleSpanExporter -from opentelemetry.trace import SpanContext, TraceFlags, TraceState, NonRecordingSpan +from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter -# Set up logging -logging.basicConfig(level=logging.INFO) -logger = logging.getLogger(__name__) +service_name = os.getenv("WORKFLOW_NAME", "default_service") +job_name = os.getenv("JOB_NAME", "default_job") -# Set up OpenTelemetry tracing -trace.set_tracer_provider( - TracerProvider( - resource=Resource.create({"service.name": os.getenv("WORKFLOW_NAME")}) - ) -) +resource = Resource.create({"service.name": service_name}) +trace.set_tracer_provider(TracerProvider(resource=resource)) tracer = trace.get_tracer(__name__) - -# Set up OTLP exporter to send to OpenTelemetry Collector -otlp_exporter = OTLPSpanExporter(endpoint="http://0.0.0.0:4317", insecure=True) - -# Set up span processor -span_processor = BatchSpanProcessor(otlp_exporter) -trace.get_tracer_provider().add_span_processor(span_processor) - -# Optionally, export to console for debugging -console_exporter = ConsoleSpanExporter() -trace.get_tracer_provider().add_span_processor(BatchSpanProcessor(console_exporter)) - -def retry_operation(operation, retries=3, delay=5): - for attempt in range(retries): - try: - return operation() - except Exception as e: - logger.error(f"Attempt {attempt + 1} failed with error: {e}") - if attempt < retries - 1: - time.sleep(delay) - else: - raise - -def start_trace(step_name): - span = tracer.start_span(name=step_name) - return span - -def end_trace(span): - span.end() +console_span_processor = BatchSpanProcessor(ConsoleSpanExporter()) +trace.get_tracer_provider().add_span_processor(console_span_processor) + +# Adding OTLP Span Exporter for actual data export +otlp_exporter = OTLPSpanExporter(endpoint="localhost:4317", insecure=True) +otlp_span_processor = BatchSpanProcessor(otlp_exporter) +trace.get_tracer_provider().add_span_processor(otlp_span_processor) + +print("Tracer initialized with service name:", service_name) + +def set_start_time(): + start_time = datetime.now().timestamp() + with open("/tmp/start_time.txt", "w") as file: + file.write(str(start_time)) + print("Start time recorded") + +def calculate_duration(): + with open("/tmp/start_time.txt", "r") as file: + start_time = float(file.read()) + end_time = datetime.now().timestamp() + duration = end_time - start_time + print(f"Total Duration: {duration}s") + with tracer.start_as_current_span(job_name) as span: + span.set_attribute("total_duration_s", duration) if __name__ == "__main__": - step_name = os.getenv("STEP_NAME", "default_step") action = os.getenv("TRACE_ACTION", "start") if action == "start": - span = retry_operation(lambda: start_trace(step_name)) - with open(f"/tmp/trace_{step_name}.txt", "w") as f: - f.write(str(span.get_span_context().trace_id)) + set_start_time() elif action == "end": - trace_id = os.getenv("TRACE_ID") - if not trace_id: - with open(f"/tmp/trace_{step_name}.txt", "r") as f: - trace_id = f.read().strip() - trace_id = int(trace_id, 16) # Convert trace_id back to int - span_context = SpanContext( - trace_id=trace_id, - span_id=0, # Span ID will be generated - trace_flags=TraceFlags(TraceFlags.SAMPLED), - trace_state=TraceState(), - is_remote=True - ) - with tracer.start_as_current_span(name=step_name, context=trace.set_span_in_context(NonRecordingSpan(span_context))): - span = tracer.start_span(name=step_name) - end_trace(span) + calculate_duration()