Spaces:

MaryAngel
/

ci-cd-anomaly-detection

Configuration error

App Files Files Community

maryangel101 committed on Sep 3, 2025

Commit

8d2ec7a

0 Parent(s):

Initial commit with calculator app and CI/CD workflow

Browse files

Files changed (10) hide show

.github/scripts/anomaly_detector.py +90 -0
.github/workflows/anomaly-check.yml +46 -0
.github/workflows/main.yml +28 -0
api_service.py +150 -0
calculator.py +15 -0
collect_logs.py +71 -0
preprocess_logs.py +136 -0
requirements.txt +10 -0
test_calculator.py +28 -0
test_system.py +110 -0

.github/scripts/anomaly_detector.py ADDED Viewed

	@@ -0,0 +1,90 @@

+import os
+import requests
+import sys
+import json
+def download_workflow_logs(run_id, token):
+    """Download logs from a specific workflow run"""
+    headers = {
+        'Authorization': f'token {token}',
+        'Accept': 'application/vnd.github.v3+json'
+    }
+    # Get repository info from environment
+    repo_full_name = os.environ.get('GITHUB_REPOSITORY')
+    # Download logs
+    url = f'https://api.github.com/repos/{repo_full_name}/actions/runs/{run_id}/logs'
+    response = requests.get(url, headers=headers)
+    if response.status_code == 200:
+        return response.text
+    else:
+        print(f"Failed to download logs: {response.status_code}")
+        return None
+def analyze_logs_with_model(log_content, model_url):
+    """Send logs to the model API for analysis"""
+    try:
+        response = requests.post(
+            f"{model_url}/predict",
+            json={
+                "log_content": log_content,
+                "include_explanation": True
+            },
+            timeout=30
+        )
+        if response.status_code == 200:
+            return response.json()
+        else:
+            print(f"Model API error: {response.status_code}")
+            return None
+    except requests.RequestException as e:
+        print(f"Failed to call model API: {e}")
+        return None
+def main():
+    # Get environment variables
+    github_token = os.environ.get('GITHUB_TOKEN')
+    workflow_run_id = os.environ.get('WORKFLOW_RUN_ID')
+    model_api_url = os.environ.get('MODEL_API_URL')
+    if not all([github_token, workflow_run_id, model_api_url]):
+        print("Missing required environment variables")
+        sys.exit(1)
+    print(f"Analyzing workflow run: {workflow_run_id}")
+    # Download logs
+    logs = download_workflow_logs(workflow_run_id, github_token)
+    if not logs:
+        print("Failed to download logs")
+        sys.exit(1)
+    # Analyze with model
+    result = analyze_logs_with_model(logs, model_api_url)
+    if not result:
+        print("Failed to analyze logs with model")
+        sys.exit(1)
+    print(f"Analysis result: {json.dumps(result, indent=2)}")
+    # Check if anomaly detected
+    if result.get('is_anomaly', False):
+        print("🚨 ANOMALY DETECTED!")
+        print(f"Confidence: {result.get('confidence', 0):.2%}")
+        print(f"Anomaly Probability: {result.get('anomaly_probability', 0):.2%}")
+        if result.get('explanation'):
+            print("Explanation:", result['explanation'])
+        # Exit with error code to mark step as failed
+        sys.exit(1)
+    else:
+        print("✅ No anomalies detected")
+        sys.exit(0)
+if __name__ == "__main__":
+    main()

.github/workflows/anomaly-check.yml ADDED Viewed

	@@ -0,0 +1,46 @@

+name: Anomaly Detection Check
+# Runs after each completed "CI Pipeline" run and sends that run's logs to the model API.
+on:
+  workflow_run:
+    workflows: ["CI Pipeline"]
+    types:
+      - completed
+# Explicit token scopes: the script downloads run logs (actions), the job checks out
+# the repository (contents), and the last step opens an issue (issues).
+permissions:
+  actions: read
+  contents: read
+  issues: write
+jobs:
+  anomaly-check:
+    runs-on: ubuntu-latest
+    if: ${{ github.event.workflow_run.conclusion != 'cancelled' }}
+    steps:
+    - name: Checkout code
+      uses: actions/checkout@v3
+    - name: Set up Python
+      uses: actions/setup-python@v3
+      with:
+        python-version: '3.9'
+    - name: Install dependencies
+      run: |
+        pip install requests
+    - name: Download and analyze workflow logs
+      env:
+        GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+        WORKFLOW_RUN_ID: ${{ github.event.workflow_run.id }}
+        MODEL_API_URL: ${{ secrets.MODEL_API_URL }}  # Set this in your repo secrets
+      run: |
+        python .github/scripts/anomaly_detector.py
+    - name: Create issue if anomaly detected
+      # NOTE: failure() is true for any failed step above, so a missing secret or a
+      # failed log download also opens an issue, not only a detected anomaly.
+      if: failure()
+      uses: actions/github-script@v6
+      with:
+        script: |
+          // Await the request so an API error fails this step instead of being dropped
+          await github.rest.issues.create({
+            owner: context.repo.owner,
+            repo: context.repo.repo,
+            title: 'CI/CD Anomaly Detected',
+            body: 'An anomaly was detected in the recent CI/CD pipeline run. Please investigate.',
+            labels: ['anomaly', 'ci-cd']
+          })

.github/workflows/main.yml ADDED Viewed

	@@ -0,0 +1,28 @@

+name: CI Pipeline
+# Runs the calculator test suite on pushes and pull requests targeting main.
+on:
+  push:
+    branches: [ main ]
+  pull_request:
+    branches: [ main ]
+jobs:
+  test:
+    runs-on: ubuntu-latest
+    steps:
+    - uses: actions/checkout@v3
+    - name: Set up Python
+      uses: actions/setup-python@v3
+      with:
+        python-version: '3.9'
+    - name: Install dependencies
+      run: |
+        python -m pip install --upgrade pip
+        pip install -r requirements.txt
+        # The test step below calls pytest, so install it here explicitly
+        pip install pytest
+    - name: Run tests
+      run: |
+        pytest -v test_calculator.py

api_service.py ADDED Viewed

	@@ -0,0 +1,150 @@

+import re
+from typing import Optional
+import joblib
+import numpy as np
+import shap
+from fastapi import FastAPI, HTTPException
+from pydantic import BaseModel
+from sklearn.ensemble import RandomForestClassifier
+from sklearn.feature_extraction.text import TfidfVectorizer
+# FastAPI application object; the route handlers below register themselves on it.
+app = FastAPI(title="CI/CD Log Anomaly Detection API")
+# Global variables for model and vectorizer.
+# Both start as None and are assigned by load_model(); they remain None when
+# the model files are missing.
+model = None
+vectorizer = None
+# NOTE(review): no code visible in this file ever assigns explainer.
+explainer = None
+# Request body accepted by POST /predict.
+class LogRequest(BaseModel):
+    # Raw log text to classify
+    log_content: str
+    # When true, the response also carries a feature-based explanation
+    include_explanation: bool = False
+class PredictionResponse(BaseModel):
+    is_anomaly: bool
+    confidence: float
+    anomaly_probability: float
+    explanation: dict = None
+def clean_log(log_content):
+    """Clean and normalize log content"""
+    # Remove timestamps
+    log_content = re.sub(r'\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}\.\d+Z', '[TIMESTAMP]', log_content)
+    # Remove specific IDs and numbers
+    log_content = re.sub(r'run_\d+', 'run_ID', log_content)
+    log_content = re.sub(r'job_\d+', 'job_ID', log_content)
+    log_content = re.sub(r'\b\d{8,}\b', '[ID]', log_content)
+    # Remove paths that might be environment-specific
+    log_content = re.sub(r'/home/[^/\s]+', '/home/USER', log_content)
+    log_content = re.sub(r'/tmp/[^/\s]+', '/tmp/TEMP', log_content)
+    # Normalize whitespace
+    log_content = re.sub(r'\s+', ' ', log_content)
+    return log_content.strip()
+@app.on_event("startup")
+async def load_model():
+    """Load the trained model and vectorizer on startup"""
+    global model, vectorizer, explainer
+    try:
+        model = joblib.load('anomaly_model.pkl')
+        vectorizer = joblib.load('vectorizer.pkl')
+        # Create SHAP explainer (simplified for speed)
+        print("Initializing SHAP explainer...")
+    except FileNotFoundError:
+        print("Warning: Model files not found. Please train the model first.")
+def get_feature_importance_explanation(log_content, prediction_proba):
+    """Get simple feature importance explanation"""
+    try:
+        cleaned_log = clean_log(log_content)
+        log_vector = vectorizer.transform([cleaned_log])
+        # Get feature names and their importance
+        feature_names = vectorizer.get_feature_names_out()
+        feature_weights = log_vector.toarray()[0]
+        # Get top features that contributed to the prediction
+        top_indices = np.argsort(feature_weights)[-10:][::-1]  # Top 10 features
+        top_features = []
+        for idx in top_indices:
+            if feature_weights[idx] > 0:
+                top_features.append({
+                    'feature': feature_names[idx],
+                    'weight': float(feature_weights[idx]),
+                    'impact': 'anomaly' if prediction_proba[1] > 0.5 else 'normal'
+                })
+        return {
+            'top_contributing_features': top_features[:5],
+            'explanation': "Features with higher weights contributed more to the prediction"
+        }
+    except Exception as e:
+        return {
+            'error': f"Could not generate explanation: {str(e)}",
+            'explanation': "Feature importance analysis failed"
+        }
+@app.post("/predict", response_model=PredictionResponse)
+async def predict_anomaly(request: LogRequest):
+    """Predict if a log indicates an anomaly"""
+    global model, vectorizer
+    if model is None or vectorizer is None:
+        raise HTTPException(status_code=500, detail="Model not loaded. Please ensure model files exist.")
+    try:
+        # Preprocess log
+        cleaned_log = clean_log(request.log_content)
+        log_vector = vectorizer.transform([cleaned_log])
+        # Make prediction
+        prediction = model.predict(log_vector)[0]
+        probabilities = model.predict_proba(log_vector)[0]
+        response = PredictionResponse(
+            is_anomaly=bool(prediction),
+            confidence=float(max(probabilities)),
+            anomaly_probability=float(probabilities[1]) if len(probabilities) > 1 else 0.0
+        )
+        # Add explanation if requested
+        if request.include_explanation:
+            response.explanation = get_feature_importance_explanation(request.log_content, probabilities)
+        return response
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=f"Prediction failed: {str(e)}")
+@app.get("/health")
+async def health_check():
+    """Health check endpoint"""
+    return {
+        "status": "healthy",
+        "model_loaded": model is not None,
+        "vectorizer_loaded": vectorizer is not None
+    }
+@app.get("/")
+async def root():
+    """Root endpoint with API information"""
+    return {
+        "message": "CI/CD Log Anomaly Detection API",
+        "version": "1.0.0",
+        "endpoints": {
+            "predict": "/predict (POST)",
+            "health": "/health (GET)",
+            "docs": "/docs (GET)"
+        }
+    }
+# Script entry point: serve the app with uvicorn on 0.0.0.0:8000.
+if __name__ == "__main__":
+    # Imported here so uvicorn is only needed when the file is run directly
+    import uvicorn
+    uvicorn.run(app, host="0.0.0.0", port=8000)

calculator.py ADDED Viewed

	@@ -0,0 +1,15 @@

+def add(x, y):
+    """Return x + y."""
+    return x + y
+def subtract(x, y):
+    """Return x - y."""
+    return x - y
+def divide(x, y):
+    """Return x / y (true division).
+
+    A zero divisor is not guarded against, so the division error propagates
+    to the caller.
+    """
+    # Intentionally buggy - no zero handling
+    return x / y
+def multiply(x, y):
+    """Return x * y."""
+    return x * y
+# Running the file directly only prints a confirmation message.
+if __name__ == "__main__":
+    print("Calculator module loaded")

collect_logs.py ADDED Viewed

	@@ -0,0 +1,71 @@

+import os
+import requests
+import json
+import zipfile
+import io
+from datetime import datetime
+class GitHubLogCollector:
+    def __init__(self, repo_owner, repo_name, token):
+        self.repo_owner = repo_owner
+        self.repo_name = repo_name
+        self.token = token
+        self.headers = {
+            'Authorization': f'token {token}',
+            'Accept': 'application/vnd.github.v3+json'
+        }
+    def get_workflow_runs(self, workflow_id='main.yml'):
+        url = f'https://api.github.com/repos/{self.repo_owner}/{self.repo_name}/actions/workflows/{workflow_id}/runs'
+        response = requests.get(url, headers=self.headers)
+        return response.json()
+    def download_log(self, run_id):
+        url = f'https://api.github.com/repos/{self.repo_owner}/{self.repo_name}/actions/runs/{run_id}/logs'
+        response = requests.get(url, headers=self.headers)
+        if response.status_code == 200:
+            # Extract zip content
+            with zipfile.ZipFile(io.BytesIO(response.content)) as zip_file:
+                log_content = ""
+                for file_name in zip_file.namelist():
+                    with zip_file.open(file_name) as file:
+                        log_content += file.read().decode('utf-8') + "\n"
+                return log_content
+        return None
+    def collect_all_logs(self):
+        os.makedirs('data/normal', exist_ok=True)
+        os.makedirs('data/anomalous', exist_ok=True)
+        runs = self.get_workflow_runs()
+        for run in runs['workflow_runs']:
+            run_id = run['id']
+            conclusion = run['conclusion']
+            created_at = run['created_at']
+            log_content = self.download_log(run_id)
+            if log_content:
+                # Clean filename
+                timestamp = datetime.fromisoformat(created_at.replace('Z', '+00:00')).strftime('%Y%m%d_%H%M%S')
+                if conclusion == 'success':
+                    filename = f'data/normal/run_{run_id}_{timestamp}.log'
+                else:
+                    filename = f'data/anomalous/run_{run_id}_{timestamp}.log'
+                with open(filename, 'w') as f:
+                    f.write(log_content)
+                print(f"Saved: {filename} (Status: {conclusion})")
+# Usage
+if __name__ == "__main__":
+    # Replace with your values
+    REPO_OWNER = "your-username"
+    REPO_NAME = "your-repo-name"
+    GITHUB_TOKEN = "your-github-token"  # Create at https://github.com/settings/tokens
+    collector = GitHubLogCollector(REPO_OWNER, REPO_NAME, GITHUB_TOKEN)
+    collector.collect_all_logs()

preprocess_logs.py ADDED Viewed

	@@ -0,0 +1,136 @@

+import os
+import re
+import pandas as pd
+from sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn.model_selection import train_test_split
+from sklearn.ensemble import RandomForestClassifier
+from sklearn.metrics import classification_report, confusion_matrix
+import joblib
+import numpy as np
+class LogPreprocessor:
+    def __init__(self):
+        self.vectorizer = TfidfVectorizer(
+            max_features=1000,
+            stop_words='english',
+            ngram_range=(1, 2),
+            min_df=2
+        )
+    def clean_log(self, log_content):
+        # Remove timestamps
+        log_content = re.sub(r'\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}\.\d+Z', '[TIMESTAMP]', log_content)
+        # Remove specific IDs and numbers
+        log_content = re.sub(r'run_\d+', 'run_ID', log_content)
+        log_content = re.sub(r'job_\d+', 'job_ID', log_content)
+        log_content = re.sub(r'\b\d{8,}\b', '[ID]', log_content)
+        # Remove paths that might be environment-specific
+        log_content = re.sub(r'/home/[^/\s]+', '/home/USER', log_content)
+        log_content = re.sub(r'/tmp/[^/\s]+', '/tmp/TEMP', log_content)
+        # Normalize whitespace
+        log_content = re.sub(r'\s+', ' ', log_content)
+        return log_content.strip()
+    def load_logs(self, data_dir):
+        logs = []
+        labels = []
+        # Load normal logs
+        normal_dir = os.path.join(data_dir, 'normal')
+        if os.path.exists(normal_dir):
+            for filename in os.listdir(normal_dir):
+                filepath = os.path.join(normal_dir, filename)
+                with open(filepath, 'r', encoding='utf-8', errors='ignore') as f:
+                    content = f.read()
+                    logs.append(self.clean_log(content))
+                    labels.append(0)  # 0 for normal
+        # Load anomalous logs
+        anomalous_dir = os.path.join(data_dir, 'anomalous')
+        if os.path.exists(anomalous_dir):
+            for filename in os.listdir(anomalous_dir):
+                filepath = os.path.join(anomalous_dir, filename)
+                with open(filepath, 'r', encoding='utf-8', errors='ignore') as f:
+                    content = f.read()
+                    logs.append(self.clean_log(content))
+                    labels.append(1)  # 1 for anomalous
+        return logs, labels
+class AnomalyDetectionModel:
+    def __init__(self):
+        self.preprocessor = LogPreprocessor()
+        self.model = RandomForestClassifier(
+            n_estimators=100,
+            random_state=42,
+            class_weight='balanced'
+        )
+        self.is_trained = False
+    def train(self, data_dir='data'):
+        print("Loading logs...")
+        logs, labels = self.preprocessor.load_logs(data_dir)
+        if len(logs) == 0:
+            raise ValueError("No logs found. Please run data collection first.")
+        print(f"Loaded {len(logs)} logs ({labels.count(0)} normal, {labels.count(1)} anomalous)")
+        # Vectorize logs
+        print("Vectorizing logs...")
+        X = self.preprocessor.vectorizer.fit_transform(logs)
+        y = np.array(labels)
+        # Split data
+        X_train, X_test, y_train, y_test = train_test_split(
+            X, y, test_size=0.2, random_state=42, stratify=y
+        )
+        # Train model
+        print("Training model...")
+        self.model.fit(X_train, y_train)
+        # Evaluate
+        y_pred = self.model.predict(X_test)
+        print("\nModel Performance:")
+        print(classification_report(y_test, y_pred, target_names=['Normal', 'Anomalous']))
+        print("\nConfusion Matrix:")
+        print(confusion_matrix(y_test, y_pred))
+        self.is_trained = True
+        # Save model and vectorizer
+        joblib.dump(self.model, 'anomaly_model.pkl')
+        joblib.dump(self.preprocessor.vectorizer, 'vectorizer.pkl')
+        print("\nModel saved as 'anomaly_model.pkl' and 'vectorizer.pkl'")
+    def predict(self, log_content):
+        if not self.is_trained:
+            # Load saved model
+            try:
+                self.model = joblib.load('anomaly_model.pkl')
+                self.preprocessor.vectorizer = joblib.load('vectorizer.pkl')
+                self.is_trained = True
+            except FileNotFoundError:
+                raise ValueError("No trained model found. Please train first.")
+        # Preprocess and predict
+        cleaned_log = self.preprocessor.clean_log(log_content)
+        log_vector = self.preprocessor.vectorizer.transform([cleaned_log])
+        prediction = self.model.predict(log_vector)[0]
+        probability = self.model.predict_proba(log_vector)[0]
+        return {
+            'is_anomaly': bool(prediction),
+            'confidence': float(max(probability)),
+            'anomaly_probability': float(probability[1]) if len(probability) > 1 else 0.0
+        }
+# Running the file directly trains on ./data and writes the two .pkl files.
+if __name__ == "__main__":
+    model = AnomalyDetectionModel()
+    model.train()

requirements.txt ADDED Viewed

	@@ -0,0 +1,10 @@

+scikit-learn==1.3.0
+pandas==2.0.3
+numpy==1.24.3
+requests==2.31.0
+python-dotenv==1.0.0
+fastapi==0.100.1
+uvicorn==0.23.2
+gradio==3.44.0
+PyGithub==1.59.0
+tqdm==4.65.0
+# Imported by api_service.py
+shap==0.42.1
+# Invoked by the CI workflow (pytest -v test_calculator.py)
+pytest==7.4.0

test_calculator.py ADDED Viewed

	@@ -0,0 +1,28 @@

+import random
+from calculator import add, subtract, divide, multiply
+def test_add():
+    """add() with two positive operands and with operands that cancel out."""
+    assert add(2, 3) == 5
+    assert add(-1, 1) == 0
+def test_subtract():
+    """subtract() with a positive result and with a negative result."""
+    assert subtract(5, 3) == 2
+    assert subtract(0, 5) == -5
+def test_multiply():
+    """multiply() with non-zero operands and with a zero operand."""
+    assert multiply(3, 4) == 12
+    assert multiply(0, 10) == 0
+def test_divide():
+    """divide() with exact quotients; a zero divisor is not exercised here."""
+    assert divide(10, 2) == 5
+    assert divide(9, 3) == 3
+# Flaky test that fails randomly
+def test_flaky():
+    if random.random() < 0.3:  # 30% chance to fail
+        assert False, "Flaky test failed randomly"
+    assert True
+# Always failing test (comment out initially)
+# def test_always_fail():
+#     assert 1 == 2, "This test always fails"

test_system.py ADDED Viewed

	@@ -0,0 +1,110 @@

+#!/usr/bin/env python3
+import subprocess
+import time
+import requests
+import json
+def run_command(cmd):
+    """Run a shell command and return the result"""
+    try:
+        result = subprocess.run(cmd, shell=True, capture_output=True, text=True)
+        return result.returncode == 0, result.stdout, result.stderr
+    except Exception as e:
+        return False, "", str(e)
+def test_api_locally():
+    """Test the FastAPI service locally"""
+    print("Testing API locally...")
+    # Start the API in background
+    print("Starting API server...")
+    api_process = subprocess.Popen(
+        ["python", "api_service.py"],
+        stdout=subprocess.PIPE,
+        stderr=subprocess.PIPE
+    )
+    # Wait for server to start
+    time.sleep(5)
+    try:
+        # Test health endpoint
+        response = requests.get("http://localhost:8000/health")
+        print(f"Health check: {response.status_code}")
+        # Test prediction with sample log
+        sample_log = """
+        2023-10-27T12:00:00.000Z [INFO] Starting CI pipeline
+        2023-10-27T12:00:01.000Z [INFO] Checkout successful
+        2023-10-27T12:00:02.000Z [ERROR] pytest failed
+        2023-10-27T12:00:03.000Z [ERROR] Test test_divide failed: ZeroDivisionError
+        """
+        response = requests.post(
+            "http://localhost:8000/predict",
+            json={
+                "log_content": sample_log,
+                "include_explanation": True
+            }
+        )
+        if response.status_code == 200:
+            result = response.json()
+            print(f"Prediction test successful:")
+            print(json.dumps(result, indent=2))
+        else:
+            print(f"Prediction test failed: {response.status_code}")
+    except requests.RequestException as e:
+        print(f"API test failed: {e}")
+    finally:
+        # Stop the API server
+        api_process.terminate()
+        api_process.wait()
+def main():
+    print("🚀 Starting CI/CD Anomaly Detection System Test")
+    print("=" * 50)
+    # Test 1: Check if model files exist
+    print("1. Checking for model files...")
+    try:
+        import joblib
+        model = joblib.load('anomaly_model.pkl')
+        vectorizer = joblib.load('vectorizer.pkl')
+        print("✅ Model files found and loaded")
+    except FileNotFoundError:
+        print("❌ Model files not found. Run training first:")
+        print("   python preprocess_logs.py")
+        return
+    # Test 2: Test the model directly
+    print("\n2. Testing model directly...")
+    from preprocess_logs import AnomalyDetectionModel
+    model = AnomalyDetectionModel()
+    # Test with normal log
+    normal_log = "INFO: All tests passed successfully"
+    result = model.predict(normal_log)
+    print(f"Normal log prediction: {result}")
+    # Test with anomalous log
+    anomalous_log = "ERROR: Test failed with ZeroDivisionError: division by zero"
+    result = model.predict(anomalous_log)
+    print(f"Anomalous log prediction: {result}")
+    # Test 3: Test API
+    print("\n3. Testing API service...")
+    test_api_locally()
+    print("\n🎉 System test completed!")
+    print("\nNext steps:")
+    print("1. Deploy your API to a cloud service")
+    print("2. Set MODEL_API_URL secret in your GitHub repo")
+    print("3. Push some code changes to trigger the workflows")
+if __name__ == "__main__":
+    main()