Spaces:

minhvtt
/

Aus_F

Sleeping

App Files Files Community

minhvtt committed on Nov 24, 2025

Commit

34b2632

verified ·

1 Parent(s): d97be90

Upload 15 files

Browse files

Files changed (15) hide show

models/__init__.py +1 -0
models/event_models.py +112 -0
models/segmentation_models.py +69 -0
models/sentiment_models.py +82 -0
scripts/__init__.py +1 -0
scripts/create_indexes.py +111 -0
services/__init__.py +1 -0
services/data_aggregation.py +188 -0
services/feedback.py +224 -0
services/genai_service.py +332 -0
services/model_registry.py +191 -0
services/monitoring.py +232 -0
services/preprocessing.py +191 -0
services/segmentation_service.py +292 -0
services/sentiment_service.py +220 -0

models/__init__.py ADDED Viewed

	@@ -0,0 +1 @@


1	+ # MongoDB Models for Audience Segmentation AI Features

models/event_models.py ADDED Viewed

	@@ -0,0 +1,112 @@

+"""
+Event-Centric Pydantic Models for MongoDB
+Author: AI Generated
+Created: 2025-11-24
+Purpose: Define schemas for event-specific analysis results
+"""
+from pydantic import BaseModel, Field
+from typing import List, Dict, Optional, Any
+from datetime import datetime
+from bson import ObjectId
+class PyObjectId(ObjectId):
+    """Custom ObjectId type for Pydantic"""
+    @classmethod
+    def __get_validators__(cls):
+        yield cls.validate
+    @classmethod
+    def validate(cls, v):
+        if not ObjectId.is_valid(v):
+            raise ValueError("Invalid ObjectId")
+        return ObjectId(v)
+    @classmethod
+    def __modify_schema__(cls, field_schema):
+        field_schema.update(type="string")
+class MarketingContent(BaseModel):
+    """
+    Marketing email content generated by AI.
+    Carries the generated subject/body plus a simple approval trail
+    (status, approval time, approver).
+    """
+    # Subject line and body text of the generated email (both required).
+    email_subject: str
+    email_body: str
+    # Free-form string; the values used so far are listed in the trailing comment.
+    status: str = "Draft"  # Draft, Approved, Sent
+    # datetime.utcnow returns a naive datetime (no tzinfo) holding the current UTC time.
+    generated_at: datetime = Field(default_factory=datetime.utcnow)
+    # Approval metadata; both stay None until set by the caller.
+    approved_at: Optional[datetime] = None
+    approved_by: Optional[str] = None
+class EventAudienceSegment(BaseModel):
+    """
+    Audience segment specific to an event.
+    Stores clustering results and marketing content for Event Owner review.
+    """
+    id: Optional[PyObjectId] = Field(default=None, alias="_id")
+    event_code: str = Field(..., description="Event identifier")
+    segment_name: str = Field(..., description="Human-readable segment name in Vietnamese")
+    segment_type: str = Field(..., description="Segment category (e.g., VIP, Potential, Dormant)")
+    user_count: int = Field(..., description="Number of users in this segment")
+    user_ids: List[PyObjectId] = Field(default_factory=list, description="List of user ObjectIds in this segment")
+    criteria: Dict[str, Any] = Field(
+        default_factory=dict,
+        description="Average statistics for this segment (e.g., avg_spend, avg_tickets, avg_recency)"
+    )
+    marketing_content: Optional[MarketingContent] = Field(
+        default=None,
+        description="AI-generated marketing email (Draft, pending approval)"
+    )
+    created_at: datetime = Field(default_factory=datetime.utcnow)
+    last_updated: datetime = Field(default_factory=datetime.utcnow)
+    class Config:
+        populate_by_name = True
+        arbitrary_types_allowed = True
+        json_encoders = {ObjectId: str}
+class AIInsights(BaseModel):
+    """
+    AI-generated insights from sentiment analysis.
+    Only ``summary`` is required; the list fields default to empty lists and
+    ``predicted_nps`` to None.
+    """
+    summary: str = Field(..., description="Overall sentiment summary in Vietnamese")
+    # NOTE(review): the "Top 5" limit is stated in the description only; nothing here enforces it.
+    top_issues: List[str] = Field(default_factory=list, description="Top 5 recurring issues")
+    improvement_suggestions: List[str] = Field(default_factory=list, description="Actionable suggestions")
+    # NOTE(review): the 0-100 range is descriptive only; no ge/le bounds are declared.
+    predicted_nps: Optional[float] = Field(None, description="Predicted Net Promoter Score (0-100)")
+class EventSentimentSummary(BaseModel):
+    """
+    Aggregated sentiment analysis summary for an event.
+    Provides Event Owner with quick insights about attendee feedback.
+    """
+    id: Optional[PyObjectId] = Field(default=None, alias="_id")
+    event_code: str = Field(..., description="Event identifier")
+    total_comments: int = Field(default=0, description="Total number of comments analyzed")
+    sentiment_distribution: Dict[str, int] = Field(
+        default_factory=dict,
+        description="Count of Positive, Negative, Neutral comments"
+    )
+    avg_confidence: float = Field(default=0.0, description="Average confidence score of sentiment predictions")
+    top_keywords: List[str] = Field(
+        default_factory=list,
+        description="Most frequently mentioned keywords/phrases"
+    )
+    ai_insights: Optional[AIInsights] = Field(
+        default=None,
+        description="AI-generated insights and recommendations"
+    )
+    last_updated: datetime = Field(default_factory=datetime.utcnow)
+    class Config:
+        populate_by_name = True
+        arbitrary_types_allowed = True
+        json_encoders = {ObjectId: str}

models/segmentation_models.py ADDED Viewed

	@@ -0,0 +1,69 @@

+"""
+MongoDB Models for Audience Segmentation
+Author: AI Generated
+Created: 2025-11-24
+Purpose: Define data models for storing audience segmentation results
+"""
+from datetime import datetime
+from typing import Optional, List, Dict
+from pydantic import BaseModel, Field
+from bson import ObjectId
+class PyObjectId(ObjectId):
+    """Custom ObjectId type for Pydantic"""
+    @classmethod
+    def __get_validators__(cls):
+        yield cls.validate
+    @classmethod
+    def validate(cls, v):
+        if not ObjectId.is_valid(v):
+            raise ValueError("Invalid ObjectId")
+        return ObjectId(v)
+    @classmethod
+    def __modify_schema__(cls, field_schema):
+        field_schema.update(type="string")
+class AudienceSegment(BaseModel):
+    """
+    Defines the characteristics of an audience segment.
+    This is the result of K-Means clustering on user behavior data.
+    Only ``segment_name`` is required; every other field has a default.
+    """
+    # Unlike a None default, default_factory=PyObjectId mints a fresh ObjectId
+    # for every instance created without an explicit ``_id``.
+    id: Optional[PyObjectId] = Field(default_factory=PyObjectId, alias="_id")
+    segment_name: str = Field(..., description="Human-readable segment name, e.g., 'Big Spenders', 'Music Lovers'")
+    description: Optional[str] = Field(None, description="Detailed description of this segment")
+    # Untyped dict: keys and value types are not validated.
+    criteria: Dict = Field(default_factory=dict, description="Statistical criteria: min_spend, max_spend, top_categories, etc.")
+    user_count: int = Field(0, description="Number of users in this segment")
+    # Defaults to the (naive, UTC) creation time of the instance.
+    last_updated: datetime = Field(default_factory=datetime.utcnow)
+    # Generative AI Output
+    # Stored as a free-form dict; the expected keys are given in the description only.
+    marketing_content: Optional[Dict] = Field(
+        None,
+        description="AI-generated marketing content: { 'email_subject': str, 'email_body': str }"
+    )
+    class Config:
+        # Pydantic v1 key: lets callers populate ``id`` by field name as well as by the ``_id`` alias.
+        allow_population_by_field_name = True
+        arbitrary_types_allowed = True
+        # Serialise ObjectId values as their hex string when dumping to JSON.
+        json_encoders = {ObjectId: str}
+class UserSegmentAssignment(BaseModel):
+    """
+    Links a user to their assigned segment.
+    Many-to-one relationship: many users belong to one segment.
+    ``user_id``, ``segment_id`` and ``confidence_score`` are required.
+    """
+    # default_factory=PyObjectId mints a fresh ObjectId when no ``_id`` is supplied.
+    id: Optional[PyObjectId] = Field(default_factory=PyObjectId, alias="_id")
+    user_id: PyObjectId = Field(..., description="Reference to User._id")
+    segment_id: PyObjectId = Field(..., description="Reference to AudienceSegment._id")
+    # NOTE(review): despite the name, the description defines this as a distance,
+    # so smaller values mean a better fit.
+    confidence_score: float = Field(..., description="Distance to cluster center (lower is better)")
+    # Defaults to the (naive, UTC) creation time of the instance.
+    assigned_at: datetime = Field(default_factory=datetime.utcnow)
+    class Config:
+        # Pydantic v1 key: lets callers populate ``id`` by field name as well as by the ``_id`` alias.
+        allow_population_by_field_name = True
+        arbitrary_types_allowed = True
+        # Serialise ObjectId values as their hex string when dumping to JSON.
+        json_encoders = {ObjectId: str}

models/sentiment_models.py ADDED Viewed

	@@ -0,0 +1,82 @@

+"""
+Sentiment Analysis Pydantic Models for MongoDB
+Author: AI Generated
+Created: 2025-11-24
+Purpose: Define schemas for sentiment analysis results
+"""
+from pydantic import BaseModel, Field
+from typing import List, Optional, Dict
+from datetime import datetime
+from bson import ObjectId
+class PyObjectId(ObjectId):
+    """Custom ObjectId type for Pydantic"""
+    @classmethod
+    def __get_validators__(cls):
+        yield cls.validate
+    @classmethod
+    def validate(cls, v):
+        if not ObjectId.is_valid(v):
+            raise ValueError("Invalid ObjectId")
+        return ObjectId(v)
+    @classmethod
+    def __modify_schema__(cls, field_schema):
+        field_schema.update(type="string")
+class SentimentAnalysisResult(BaseModel):
+    """Individual sentiment analysis result for a comment/feedback"""
+    id: Optional[PyObjectId] = Field(default=None, alias="_id")
+    source_id: PyObjectId = Field(..., description="ID of the original comment/post")
+    source_type: str = Field(default="UserCommentPost", description="Type of source")
+    # NEW: Event context
+    event_code: str = Field(..., description="Event identifier this comment belongs to")
+    sentiment_label: str = Field(..., description="Positive, Negative, or Neutral")
+    confidence_score: float = Field(..., ge=0.0, le=1.0, description="Model confidence (0-1)")
+    key_phrases: List[str] = Field(
+        default_factory=list,
+        description="Extracted keywords/phrases from the text"
+    )
+    analyzed_at: datetime = Field(default_factory=datetime.utcnow)
+    class Config:
+        populate_by_name = True
+        arbitrary_types_allowed = True
+        json_encoders = {ObjectId: str}
+class EventInsightReport(BaseModel):
+    """
+    High-level insights for an event, generated by LLM.
+    Includes Top 5 issues, NPS prediction, and improvement suggestions.
+    """
+    id: Optional[PyObjectId] = Field(default=None, alias="_id")
+    event_code: str = Field(..., description="Reference to EventVersion.EventCode")
+    report_date: datetime = Field(default_factory=datetime.utcnow)
+    total_comments: int = Field(0, description="Total number of comments analyzed")
+    sentiment_breakdown: Dict[str, int] = Field(
+        default_factory=dict,
+        description="Count by sentiment: { 'Positive': 50, 'Negative': 10, 'Neutral': 20 }"
+    )
+    predicted_nps: Optional[float] = Field(None, description="Predicted NPS score (0-100)")
+    top_issues: List[str] = Field(
+        default_factory=list,
+        description="Top 5 recurring issues, e.g., ['Check-in slow', 'Sound quality poor']"
+    )
+    improvement_suggestions: List[str] = Field(
+        default_factory=list,
+        description="AI-generated suggestions for improvement"
+    )
+    class Config:
+        populate_by_name = True
+        arbitrary_types_allowed = True
+        json_encoders = {ObjectId: str}

scripts/__init__.py ADDED Viewed

	@@ -0,0 +1 @@


1	+ # scripts/__init__.py - marker file

scripts/create_indexes.py ADDED Viewed

	@@ -0,0 +1,111 @@

+"""
+MongoDB Index Creation Script
+Author: AI Generated
+Created: 2025-11-24
+Purpose: Create performance indexes for event-centric queries
+"""
+from database import db
+from config import settings
+def create_all_indexes():
+    """
+    Create all necessary indexes for optimal performance.
+    Run this once during deployment or when setting up a new environment.
+    """
+    print("=" * 60)
+    print("🔧 Creating MongoDB Indexes")
+    print("=" * 60)
+    # Payment Collection Indexes
+    print("\n📊 Payment Collection:")
+    # Index for event-specific ticket purchases
+    db.payments.create_index(
+        [("EventCode", 1), ("Status", 1), ("UserId", 1)],
+        name="idx_payment_event_status_user"
+    )
+    print("  ✓ Created: idx_payment_event_status_user")
+    # Index for user RFM calculation
+    db.payments.create_index(
+        [("UserId", 1), ("TransactionDate", -1)],
+        name="idx_payment_user_date"
+    )
+    print("  ✓ Created: idx_payment_user_date")
+    # UserFollow Collection Indexes
+    print("\n👥 UserFollow Collection:")
+    # Index for event followers
+    db.user_follows.create_index(
+        [("EventCode", 1), ("userId", 1)],
+        name="idx_follow_event_user"
+    )
+    print("  ✓ Created: idx_follow_event_user")
+    # UserCommentPost Collection Indexes
+    print("\n💬 UserCommentPost Collection:")
+    # Index for event comments
+    db.user_comment_post.create_index(
+        [("EventCode", 1), ("CreatedDate", -1)],
+        name="idx_comment_event_date"
+    )
+    print("  ✓ Created: idx_comment_event_date")
+    # EventAudienceSegment Collection Indexes
+    print("\n🎯 EventAudienceSegment Collection:")
+    # Index for event owner dashboard
+    db.event_audience_segments.create_index(
+        [("event_code", 1)],
+        name="idx_segment_event"
+    )
+    print("  ✓ Created: idx_segment_event")
+    # Index for status filtering
+    db.event_audience_segments.create_index(
+        [("event_code", 1), ("marketing_content.status", 1)],
+        name="idx_segment_event_status"
+    )
+    print("  ✓ Created: idx_segment_event_status")
+    # EventSentimentSummary Collection Indexes
+    print("\n📊 EventSentimentSummary Collection:")
+    # Index for event sentiment lookup
+    db.event_sentiment_summary.create_index(
+        [("event_code", 1), ("last_updated", -1)],
+        name="idx_sentiment_event_date"
+    )
+    print("  ✓ Created: idx_sentiment_event_date")
+    # SentimentAnalysisResult Collection Indexes
+    print("\n💭 SentimentAnalysisResult Collection:")
+    # Index for event-specific sentiment results
+    db.sentiment_results.create_index(
+        [("event_code", 1), ("analyzed_at", -1)],
+        name="idx_sentiment_result_event_date"
+    )
+    print("  ✓ Created: idx_sentiment_result_event_date")
+    print("\n" + "=" * 60)
+    print("✅ All Indexes Created Successfully!")
+    print("=" * 60)
+    # List all indexes for verification
+    print("\n📋 Index Summary:")
+    print(f"  Payment: {len(list(db.payments.list_indexes()))} indexes")
+    print(f"  UserFollow: {len(list(db.user_follows.list_indexes()))} indexes")
+    print(f"  UserCommentPost: {len(list(db.user_comment_post.list_indexes()))} indexes")
+    print(f"  EventAudienceSegment: {len(list(db.event_audience_segments.list_indexes()))} indexes")
+    print(f"  EventSentimentSummary: {len(list(db.event_sentiment_summary.list_indexes()))} indexes")
+    print(f"  SentimentAnalysisResult: {len(list(db.sentiment_results.list_indexes()))} indexes")
+# Script entry point: build the indexes only when this file is executed directly.
+if __name__ == "__main__":
+    create_all_indexes()

services/__init__.py ADDED Viewed

	@@ -0,0 +1 @@


1	+ # Services Package

services/data_aggregation.py ADDED Viewed

	@@ -0,0 +1,188 @@

+"""
+Data Aggregation Pipeline for Event-Centric User Segmentation
+Author: AI Generated
+Created: 2025-11-24 (Refactored for event-centric analysis)
+Purpose: Aggregate user features for a specific event using MongoDB pipelines
+"""
+from typing import List, Dict
+from datetime import datetime
+from database import db
+from config import settings
+class UserDataAggregator:
+    """
+    Aggregates user behavioral data for segmentation per event.
+    Uses MongoDB Aggregation Framework to minimize data transfer.
+    The whole computation is a single pipeline run against the users collection.
+    """
+    def __init__(self, event_code: str):
+        """
+        Initialize aggregator for a specific event.
+        Args:
+            event_code: Event identifier to filter users
+        """
+        self.event_code = event_code
+        # Module-level database handle imported from ``database``.
+        self.db = db
+    def aggregate_user_features(self) -> List[Dict]:
+        """
+        Aggregate user features for the specified event.
+        Returns active users who did at least one of:
+        1. Bought tickets for this event (payment with Status "Completed")
+        2. Follow this event
+        NOTE(review): an earlier version of this docstring also listed
+        "commented on this event", but the pipeline below never reads the
+        comments collection, so commenters are not included.
+        Returns: List of user feature vectors with event-specific context
+        (event ticket count/spend, follower flag, global recency/frequency/monetary).
+        """
+        pipeline = [
+            # Stage 1: Start from every user whose Status is "Active".
+            # The event filter is only applied in Stage 5, after the lookups.
+            {
+                "$match": {
+                    "Status": "Active"
+                }
+            },
+            # Stage 2: Lookup tickets bought for THIS EVENT
+            # (completed payments of this user for self.event_code)
+            {
+                "$lookup": {
+                    "from": settings.COLLECTION_PAYMENTS,
+                    "let": {"user_id": "$_id"},
+                    "pipeline": [
+                        {
+                            "$match": {
+                                "$expr": {
+                                    "$and": [
+                                        {"$eq": ["$UserId", "$$user_id"]},
+                                        {"$eq": ["$EventCode", self.event_code]},
+                                        {"$eq": ["$Status", "Completed"]}
+                                    ]
+                                }
+                            }
+                        }
+                    ],
+                    "as": "event_tickets"
+                }
+            },
+            # Stage 3: Lookup follows for THIS EVENT
+            # (note the lower-case "userId" field on the follows collection)
+            {
+                "$lookup": {
+                    "from": settings.COLLECTION_USER_FOLLOWS,
+                    "let": {"user_id": "$_id"},
+                    "pipeline": [
+                        {
+                            "$match": {
+                                "$expr": {
+                                    "$and": [
+                                        {"$eq": ["$userId", "$$user_id"]},
+                                        {"$eq": ["$EventCode", self.event_code]}
+                                    ]
+                                }
+                            }
+                        }
+                    ],
+                    "as": "event_follows"
+                }
+            },
+            # Stage 4: Lookup all payments for global RFM (user lifetime value)
+            # NOTE(review): combining localField/foreignField with "pipeline" and
+            # the $dateDiff operator used in Stage 7 presumably require MongoDB 5.0+;
+            # confirm the server version.
+            {
+                "$lookup": {
+                    "from": settings.COLLECTION_PAYMENTS,
+                    "localField": "_id",
+                    "foreignField": "UserId",
+                    "as": "all_payments",
+                    "pipeline": [
+                        {
+                            "$match": {
+                                "Status": "Completed"
+                            }
+                        }
+                    ]
+                }
+            },
+            # Stage 5: Filter users who interacted with this event
+            # (at least one ticket purchase or one follow record)
+            {
+                "$match": {
+                    "$or": [
+                        {"event_tickets": {"$ne": []}},
+                        {"event_follows": {"$ne": []}}
+                    ]
+                }
+            },
+            # Stage 6: Calculate event-specific metrics
+            {
+                "$addFields": {
+                    # Event-specific: tickets bought for THIS event
+                    "event_ticket_count": {"$size": "$event_tickets"},
+                    "event_total_spend": {"$sum": "$event_tickets.Amount"},
+                    # Event-specific: follow status (1 = follower, 0 = not)
+                    "is_follower": {
+                        "$cond": [
+                            {"$gt": [{"$size": "$event_follows"}, 0]},
+                            1,
+                            0
+                        ]
+                    },
+                    # Global RFM: user's overall purchasing power
+                    "global_total_spend": {"$sum": "$all_payments.Amount"},
+                    "global_transaction_count": {"$size": "$all_payments"},
+                    "global_last_transaction": {"$max": "$all_payments.TransactionDate"}
+                }
+            },
+            # Stage 7: Calculate global recency
+            # Days since the last completed payment; 999999 is the sentinel used
+            # when the user has no completed payment at all.
+            {
+                "$addFields": {
+                    "global_recency_days": {
+                        "$cond": {
+                            "if": {"$gt": ["$global_last_transaction", None]},
+                            "then": {
+                                "$dateDiff": {
+                                    "startDate": "$global_last_transaction",
+                                    "endDate": "$$NOW",
+                                    "unit": "day"
+                                }
+                            },
+                            "else": 999999
+                        }
+                    }
+                }
+            },
+            # Stage 8: Project final feature vector
+            # NOTE(review): "email" is lower-case while the name fields are read
+            # from "FirstName"/"LastName" - confirm the field casing on the user documents.
+            {
+                "$project": {
+                    "_id": 1,
+                    "user_id": "$_id",
+                    "email": 1,
+                    "firstName": "$FirstName",
+                    "lastName": "$LastName",
+                    # Event-specific features
+                    "event_ticket_count": 1,
+                    "event_total_spend": 1,
+                    "is_follower": 1,
+                    # Global features (user power)
+                    "global_recency": "$global_recency_days",
+                    "global_frequency": "$global_transaction_count",
+                    "global_monetary": "$global_total_spend"
+                }
+            }
+        ]
+        print(f"🔄 Running aggregation for event: {self.event_code}")
+        # The cursor is fully materialised into a list before returning.
+        results = list(self.db.users.aggregate(pipeline, allowDiskUse=True))
+        print(f"✓ Found {len(results)} users who interacted with this event")
+        return results

services/feedback.py ADDED Viewed

	@@ -0,0 +1,224 @@

+"""
+Feedback Loop System
+Author: AI Generated
+Created: 2025-11-24
+Purpose: Collect feedback metrics to improve AI models over time
+"""
+from datetime import datetime
+from typing import Dict, Optional
+from bson import ObjectId
+from database import db
+class FeedbackCollector:
+    """
+    Collect feedback on AI outputs for continuous improvement.
+    """
+    def __init__(self):
+        self.collection = "AIFeedback"
+    def record_email_engagement(self,
+                                segment_id: str,
+                                user_id: str,
+                                opened: bool = False,
+                                clicked: bool = False,
+                                converted: bool = False,
+                                unsubscribed: bool = False):
+        """
+        Record email engagement metrics.
+        Used to evaluate email generation quality.
+        """
+        doc = {
+            "feedback_type": "email_engagement",
+            "segment_id": ObjectId(segment_id),
+            "user_id": ObjectId(user_id),
+            "opened": opened,
+            "clicked": clicked,
+            "converted": converted,
+            "unsubscribed": unsubscribed,
+            "timestamp": datetime.utcnow()
+        }
+        db.get_collection(self.collection).insert_one(doc)
+    def record_sentiment_correction(self,
+                                   analysis_id: str,
+                                   original_label: str,
+                                   corrected_label: str,
+                                   corrected_by: str):
+        """
+        Record manual corrections to sentiment analysis.
+        Used to fine-tune PhoBERT.
+        """
+        doc = {
+            "feedback_type": "sentiment_correction",
+            "analysis_id": ObjectId(analysis_id),
+            "original_label": original_label,
+            "corrected_label": corrected_label,
+            "corrected_by": corrected_by,
+            "timestamp": datetime.utcnow()
+        }
+        db.get_collection(self.collection).insert_one(doc)
+    def record_segment_feedback(self,
+                                segment_id: str,
+                                user_id: str,
+                                interaction_type: str,
+                                value: Optional[float] = None):
+        """
+        Record user interactions with segment-targeted campaigns.
+        interaction_type: 'purchase', 'view', 'ignore', etc.
+        value: revenue/engagement metric
+        """
+        doc = {
+            "feedback_type": "segment_interaction",
+            "segment_id": ObjectId(segment_id),
+            "user_id": ObjectId(user_id),
+            "interaction_type": interaction_type,
+            "value": value,
+            "timestamp": datetime.utcnow()
+        }
+        db.get_collection(self.collection).insert_one(doc)
+    def record_insight_usefulness(self,
+                                  insight_report_id: str,
+                                  user_id: str,
+                                  rating: int,
+                                  implemented: bool = False):
+        """
+        Record how useful an insight report was.
+        rating: 1-5 stars
+        """
+        doc = {
+            "feedback_type": "insight_rating",
+            "insight_report_id": ObjectId(insight_report_id),
+            "user_id": user_id,
+            "rating": rating,
+            "implemented": implemented,
+            "timestamp": datetime.utcnow()
+        }
+        db.get_collection(self.collection).insert_one(doc)
+    def get_email_performance(self, segment_id: str) -> Dict:
+        """
+        Get aggregated email performance for a segment.
+        """
+        pipeline = [
+            {
+                "$match": {
+                    "feedback_type": "email_engagement",
+                    "segment_id": ObjectId(segment_id)
+                }
+            },
+            {
+                "$group": {
+                    "_id": None,
+                    "total_sent": {"$sum": 1},
+                    "opened": {"$sum": {"$cond": ["$opened", 1, 0]}},
+                    "clicked": {"$sum": {"$cond": ["$clicked", 1, 0]}},
+                    "converted": {"$sum": {"$cond": ["$converted", 1, 0]}},
+                    "unsubscribed": {"$sum": {"$cond": ["$unsubscribed", 1, 0]}}
+                }
+            }
+        ]
+        results = list(db.get_collection(self.collection).aggregate(pipeline))
+        if not results:
+            return {"error": "No data"}
+        data = results[0]
+        total = data["total_sent"]
+        return {
+            "total_sent": total,
+            "open_rate": data["opened"] / total if total > 0 else 0,
+            "click_rate": data["clicked"] / total if total > 0 else 0,
+            "conversion_rate": data["converted"] / total if total > 0 else 0,
+            "unsubscribe_rate": data["unsubscribed"] / total if total > 0 else 0
+        }
+    def get_sentiment_accuracy(self) -> Dict:
+        """
+        Calculate sentiment analysis accuracy based on corrections.
+        """
+        corrections = list(db.get_collection(self.collection).find({
+            "feedback_type": "sentiment_correction"
+        }))
+        if not corrections:
+            return {"error": "No corrections recorded"}
+        total = len(corrections)
+        correct = sum(1 for c in corrections if c["original_label"] == c["corrected_label"])
+        accuracy = correct / total
+        # Breakdown by label
+        by_label = {}
+        for c in corrections:
+            label = c["original_label"]
+            if label not in by_label:
+                by_label[label] = {"total": 0, "correct": 0}
+            by_label[label]["total"] += 1
+            if c["original_label"] == c["corrected_label"]:
+                by_label[label]["correct"] += 1
+        for label in by_label:
+            data = by_label[label]
+            by_label[label]["accuracy"] = data["correct"] / data["total"]
+        return {
+            "overall_accuracy": accuracy,
+            "total_corrections": total,
+            "by_label": by_label
+        }
+    def get_retaining_dataset(self) -> tuple:
+        """
+        Get dataset for retraining sentiment model from corrections.
+        Returns: (texts, labels)
+        """
+        corrections = list(db.get_collection(self.collection).find({
+            "feedback_type": "sentiment_correction"
+        }))
+        # Fetch original texts
+        analysis_ids = [c["analysis_id"] for c in corrections]
+        analyses = {
+            str(a["_id"]): a
+            for a in db.sentiment_results.find({"_id": {"$in": analysis_ids}})
+        }
+        # Get comment texts
+        source_ids = [analyses[str(c["analysis_id"])]["source_id"] for c in corrections if str(c["analysis_id"]) in analyses]
+        comments = {
+            str(c["_id"]): c.get("CommentText", "")
+            for c in db.user_comment_post.find({"_id": {"$in": source_ids}})
+        }
+        # Build training data
+        texts = []
+        labels = []
+        for c in corrections:
+            analysis_id_str = str(c["analysis_id"])
+            if analysis_id_str in analyses:
+                source_id_str = str(analyses[analysis_id_str]["source_id"])
+                if source_id_str in comments:
+                    texts.append(comments[source_id_str])
+                    labels.append(c["corrected_label"])
+        print(f"✓ Built retraining dataset: {len(texts)} samples")
+        return texts, labels
+# Module-level FeedbackCollector instance, created when this module is imported.
+feedback = FeedbackCollector()

services/genai_service.py ADDED Viewed

	@@ -0,0 +1,332 @@

+"""
+Event-Centric Generative AI Service
+Author: AI Generated
+Created: 2025-11-24 (Refactored)
+Purpose: Generate marketing content and insights with event context
+"""
+from llama_cpp import Llama
+from typing import Dict, List
+from datetime import datetime
+from bson import ObjectId
+from database import db
+from config import settings
+from models.event_models import EventSentimentSummary, AIInsights, MarketingContent
+from services.monitoring import monitor
+from services.model_registry import registry
+class GenerativeAIService:
+    """
+    Event-centric GenAI using Vistral-7B-Chat.
+    """
+    def __init__(self, event_code: str):
+        """
+        Initialize for a specific event.
+        Args:
+            event_code: Event identifier
+        """
+        self.event_code = event_code
+        self.model_path = settings.LLM_LOCAL_PATH
+        self.llm = None
+    def load_model(self):
+        """Load Vistral-7B-Chat model"""
+        print(f"🔄 Loading Vistral-7B-Chat from {self.model_path}")
+        self.llm = Llama(
+            model_path=self.model_path,
+            n_ctx=2048,
+            n_threads=4,
+            n_gpu_layers=0  # CPU only
+        )
+        print("✓ Model loaded")
+    def generate_email_for_segment(self, segment: Dict) -> MarketingContent:
+        """
+        Generate personalized email for a segment.
+        Task: NLG for Personalization with Event Context
+        """
+        if not self.llm:
+            self.load_model()
+        # Get event info
+        event = db.event_versions.find_one({"_id": self.event_code})
+        event_name = event.get("EventName", "Sự kiện") if event else "Sự kiện"
+        # Build prompt with event context
+        prompt = f"""Bạn là chuyên gia marketing sự kiện.
+Sự kiện: {event_name}
+Mã sự kiện: {self.event_code}
+Phân khúc khách hàng: {segment['segment_name']}
+Đặc điểm:
+- Số vé trung bình: {segment['criteria'].get('event_tickets', 0):.1f}
+- Chi tiêu trung bình: {segment['criteria'].get('event_spend', 0):,.0f} VNĐ
+- Giá trị khách hàng toàn cục: {segment['criteria'].get('global_monetary', 0):,.0f} VNĐ
+Nhiệm vụ: Tạo email marketing cá nhân hóa cho phân khúc này.
+Định dạng:
+SUBJECT: [tiêu đề email hấp dẫn]
+BODY:
+[nội dung email bằng tiếng Việt, 3-4 đoạn văn, tập trung vào giá trị cho khách hàng]
+"""
+        response = self.llm(
+            prompt,
+            max_tokens=512,
+            temperature=0.7,
+            stop=["</s>", "###"]
+        )
+        generated_text = response['choices'][0]['text']
+        # Parse response
+        lines = generated_text.split('\n')
+        subject = ""
+        body_lines = []
+        for line in lines:
+            if line.startswith("SUBJECT:"):
+                subject = line.replace("SUBJECT:", "").strip()
+            elif line.startswith("BODY:"):
+                continue
+            elif line.strip():
+                body_lines.append(line.strip())
+        if not subject:
+            subject = f"Ưu đãi đặc biệt cho {segment['segment_name']}"
+        body = "\n\n".join(body_lines) if body_lines else generated_text
+        return MarketingContent(
+            email_subject=subject,
+            email_body=body,
+            status="Draft",
+            generated_at=datetime.utcnow()
+        )
+    def generate_emails_for_all_segments(self):
+        """
+        Generate emails for all segments of this event.
+        """
+        import time
+        start_time = time.time()
+        print("=" * 60)
+        print(f"🚀 Generating Emails for Event: {self.event_code}")
+        print("=" * 60)
+        try:
+            if not self.llm:
+                self.load_model()
+            # Find segments without marketing content
+            segments = list(db.event_audience_segments.find({
+                "event_code": self.event_code,
+                "marketing_content": None
+            }))
+            if not segments:
+                print("✓ All segments already have marketing content")
+                return
+            print(f"✓ Generating for {len(segments)} segments")
+            for segment in segments:
+                print(f"\n🔄 {segment['segment_name']}...")
+                email_content = self.generate_email_for_segment(segment)
+                # Update segment
+                db.event_audience_segments.update_one(
+                    {"_id": segment['_id']},
+                    {"$set": {
+                        "marketing_content": email_content.dict(),
+                        "last_updated": datetime.utcnow()
+                    }}
+                )
+                print(f"  ✓ Subject: {email_content.email_subject[:50]}...")
+            # Save prompt template
+            registry.save_prompt_template(
+                f"email_gen_{self.event_code}",
+                "Email generation prompt for event segments",
+                {"event_code": self.event_code, "version": "1.0"}
+            )
+            # Monitoring
+            execution_time = time.time() - start_time
+            metrics = {
+                "event_code": self.event_code,
+                "n_generated": len(segments),
+                "total_time": execution_time
+            }
+            monitor.log_genai_run("email_generation", metrics)
+            print("=" * 60)
+            print("✅ Email Generation Complete!")
+            print(f"⏱️  Time: {execution_time:.2f}s")
+            print("=" * 60)
+        except Exception as e:
+            monitor.log_error("genai_email", e, {
+                "event_code": self.event_code
+            })
+            raise
+    def generate_insights_from_sentiment(self) -> AIInsights:
+        """
+        Generate AI insights from negative comments.
+        Task: Prompted Generative Layer for Insight Extraction
+        """
+        if not self.llm:
+            self.load_model()
+        # Get negative comments
+        negative_results = list(db.sentiment_results.find({
+            "event_code": self.event_code,
+            "sentiment_label": "Negative"
+        }).limit(50))
+        if not negative_results:
+            return AIInsights(
+                summary="Không có phản hồi tiêu cực.",
+                top_issues=[],
+                improvement_suggestions=[],
+                predicted_nps=70.0
+            )
+        # Get comment texts
+        comment_ids = [r['source_id'] for r in negative_results]
+        comments = list(db.user_comment_post.find({
+            "_id": {"$in": comment_ids}
+        }))
+        negative_texts = [c.get('CommentText', '') for c in comments if c.get('CommentText')]
+        # Build prompt
+        comments_text = "\n".join([f"- {text[:100]}" for text in negative_texts[:20]])
+        prompt = f"""Bạn là chuyên gia phân tích feedback sự kiện.
+Sự kiện: {self.event_code}
+Số feedback tiêu cực: {len(negative_texts)}
+Feedback tiêu cực:
+{comments_text}
+Nhiệm vụ: Phân tích và đưa ra:
+1. TOP 5 vấn đề chính (mỗi vấn đề 1 dòng)
+2. ĐỀ XUẤT cải thiện (3-5 đề xuất cụ thể)
+3. DỰ ĐOÁN NPS (điểm từ 0-100)
+Định dạng:
+TOP_ISSUES:
+1. [vấn đề 1]
+2. [vấn đề 2]
+...
+SUGGESTIONS:
+- [đề xuất 1]
+- [đề xuất 2]
+...
+NPS: [số]
+"""
+        response = self.llm(
+            prompt,
+            max_tokens=512,
+            temperature=0.7
+        )
+        generated = response['choices'][0]['text']
+        # Parse
+        top_issues = []
+        suggestions = []
+        predicted_nps = 60.0
+        lines = generated.split('\n')
+        current_section = None
+        for line in lines:
+            line = line.strip()
+            if line.startswith("TOP_ISSUES:"):
+                current_section = "issues"
+            elif line.startswith("SUGGESTIONS:"):
+                current_section = "suggestions"
+            elif line.startswith("NPS:"):
+                try:
+                    predicted_nps = float(line.split(":")[1].strip())
+                except:
+                    pass
+            elif current_section == "issues" and (line.startswith("-") or line[0].isdigit()):
+                issue = line.lstrip("0123456789.-) ").strip()
+                if issue:
+                    top_issues.append(issue)
+            elif current_section == "suggestions" and line.startswith("-"):
+                suggestion = line.lstrip("- ").strip()
+                if suggestion:
+                    suggestions.append(suggestion)
+        # Create summary
+        total_comments = db.sentiment_results.count_documents({"event_code": self.event_code})
+        negative_pct = (len(negative_results) / total_comments * 100) if total_comments > 0 else 0
+        summary = f"Sự kiện nhận được {total_comments} phản hồi, trong đó {len(negative_results)} ({negative_pct:.1f}%) phản hồi tiêu cực. "
+        if top_issues:
+            summary += f"Vấn đề chính: {top_issues[0]}."
+        return AIInsights(
+            summary=summary,
+            top_issues=top_issues[:5],
+            improvement_suggestions=suggestions[:5],
+            predicted_nps=predicted_nps
+        )
+    def update_sentiment_summary_with_insights(self):
+        """
+        Generate and update EventSentimentSummary with AI insights.
+        """
+        print("=" * 60)
+        print(f"🚀 Generating Insights for Event: {self.event_code}")
+        print("=" * 60)
+        try:
+            insights = self.generate_insights_from_sentiment()
+            # Update summary
+            db.event_sentiment_summary.update_one(
+                {"event_code": self.event_code},
+                {"$set": {
+                    "ai_insights": insights.dict(),
+                    "last_updated": datetime.utcnow()
+                }},
+                upsert=True
+            )
+            print("✓ Insights generated:")
+            print(f"  Top Issues: {len(insights.top_issues)}")
+            print(f"  Suggestions: {len(insights.improvement_suggestions)}")
+            print(f"  Predicted NPS: {insights.predicted_nps}")
+            print("=" * 60)
+            print("✅ Insights Complete!")
+            print("=" * 60)
+            return insights
+        except Exception as e:
+            monitor.log_error("genai_insights", e, {
+                "event_code": self.event_code
+            })
+            raise

services/model_registry.py ADDED Viewed

	@@ -0,0 +1,191 @@

+"""
+Model Registry & Versioning Module
+Author: AI Generated
+Created: 2025-11-24
+Purpose: Track and version AI models and configurations
+"""
+import pickle
+import json
+from datetime import datetime
+from pathlib import Path
+from typing import Any, Dict, Optional
+import hashlib
+from database import db
+class ModelRegistry:
+    """
+    Manage model versions and configurations.
+    Stores models locally and metadata in MongoDB.
+    """
+    def __init__(self, storage_dir: str = "./model_storage"):
+        self.storage_dir = Path(storage_dir)
+        self.storage_dir.mkdir(exist_ok=True)
+        self.collection = "ModelRegistry"
+    def _generate_version_id(self, model_name: str) -> str:
+        """Generate a unique version ID based on timestamp."""
+        timestamp = datetime.utcnow().strftime("%Y%m%d_%H%M%S")
+        return f"{model_name}_v{timestamp}"
+    def _calculate_hash(self, file_path: Path) -> str:
+        """Calculate MD5 hash of model file for integrity check."""
+        md5_hash = hashlib.md5()
+        with open(file_path, "rb") as f:
+            for chunk in iter(lambda: f.read(4096), b""):
+                md5_hash.update(chunk)
+        return md5_hash.hexdigest()
+    def save_model(self, model: Any, model_name: str, metadata: Dict = None) -> str:
+        """
+        Save a model with versioning.
+        Args:
+            model: The model object (sklearn, torch, etc.)
+            model_name: Name identifier (e.g., "kmeans_segmentation")
+            metadata: Additional info (params, metrics, etc.)
+        Returns:
+            version_id: Unique version identifier
+        """
+        version_id = self._generate_version_id(model_name)
+        model_path = self.storage_dir / f"{version_id}.pkl"
+        # Save model file
+        with open(model_path, 'wb') as f:
+            pickle.dump(model, f)
+        # Calculate file hash
+        file_hash = self._calculate_hash(model_path)
+        # Save metadata to MongoDB
+        doc = {
+            "version_id": version_id,
+            "model_name": model_name,
+            "file_path": str(model_path),
+            "file_hash": file_hash,
+            "file_size": model_path.stat().st_size,
+            "created_at": datetime.utcnow(),
+            "metadata": metadata or {},
+            "status": "active"
+        }
+        db.get_collection(self.collection).insert_one(doc)
+        print(f"✓ Saved model: {version_id}")
+        return version_id
+    def load_model(self, version_id: str = None, model_name: str = None) -> tuple:
+        """
+        Load a model by version_id or latest version of model_name.
+        Returns: (model, metadata)
+        """
+        if version_id:
+            doc = db.get_collection(self.collection).find_one({"version_id": version_id})
+        elif model_name:
+            # Get latest version
+            doc = db.get_collection(self.collection).find_one(
+                {"model_name": model_name, "status": "active"},
+                sort=[("created_at", -1)]
+            )
+        else:
+            raise ValueError("Must provide version_id or model_name")
+        if not doc:
+            raise ValueError("Model not found")
+        # Load model file
+        model_path = Path(doc["file_path"])
+        # Verify integrity
+        current_hash = self._calculate_hash(model_path)
+        if current_hash != doc["file_hash"]:
+            raise ValueError("Model file corrupted (hash mismatch)")
+        with open(model_path, 'rb') as f:
+            model = pickle.load(f)
+        print(f"✓ Loaded model: {doc['version_id']}")
+        return model, doc["metadata"]
+    def save_prompt_template(self, template_name: str, prompt: str, metadata: Dict = None) -> str:
+        """
+        Save a prompt template with versioning.
+        """
+        version_id = self._generate_version_id(template_name)
+        # Save to file
+        template_path = self.storage_dir / f"{version_id}.txt"
+        with open(template_path, 'w', encoding='utf-8') as f:
+            f.write(prompt)
+        # Save metadata
+        doc = {
+            "version_id": version_id,
+            "template_name": template_name,
+            "file_path": str(template_path),
+            "created_at": datetime.utcnow(),
+            "metadata": metadata or {},
+            "status": "active"
+        }
+        db.get_collection("PromptTemplates").insert_one(doc)
+        print(f"✓ Saved prompt template: {version_id}")
+        return version_id
+    def load_prompt_template(self, version_id: str = None, template_name: str = None) -> str:
+        """
+        Load a prompt template.
+        """
+        if version_id:
+            doc = db.get_collection("PromptTemplates").find_one({"version_id": version_id})
+        elif template_name:
+            doc = db.get_collection("PromptTemplates").find_one(
+                {"template_name": template_name, "status": "active"},
+                sort=[("created_at", -1)]
+            )
+        else:
+            raise ValueError("Must provide version_id or template_name")
+        if not doc:
+            raise ValueError("Template not found")
+        with open(doc["file_path"], 'r', encoding='utf-8') as f:
+            prompt = f.read()
+        return prompt
+    def list_versions(self, model_name: str) -> list:
+        """
+        List all versions of a model.
+        """
+        versions = list(db.get_collection(self.collection).find(
+            {"model_name": model_name},
+            sort=[("created_at", -1)]
+        ))
+        return [{
+            "version_id": v["version_id"],
+            "created_at": v["created_at"],
+            "status": v["status"],
+            "metadata": v.get("metadata", {})
+        } for v in versions]
+    def archive_version(self, version_id: str):
+        """
+        Archive (deactivate) a model version.
+        """
+        db.get_collection(self.collection).update_one(
+            {"version_id": version_id},
+            {"$set": {"status": "archived"}}
+        )
+        print(f"✓ Archived model: {version_id}")
+# Global registry instance, created when this module is imported with the
+# default "./model_storage" storage directory
+registry = ModelRegistry()

services/monitoring.py ADDED Viewed

	@@ -0,0 +1,232 @@

+"""
+Logging & Monitoring Module
+Author: AI Generated
+Created: 2025-11-24
+Purpose: Track pipeline performance, errors, and model drift
+"""
+import logging
+from datetime import datetime
+from typing import Dict, Any, Optional
+import json
+from pathlib import Path
+import numpy as np
+from database import db
+# Configure logging
+LOG_DIR = Path("logs")
+LOG_DIR.mkdir(exist_ok=True)
+logging.basicConfig(
+    level=logging.INFO,
+    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
+    handlers=[
+        logging.FileHandler(LOG_DIR / 'pipeline.log'),
+        logging.StreamHandler()
+    ]
+)
+logger = logging.getLogger(__name__)
+class PipelineMonitor:
+    """
+    Monitor AI pipeline performance and log metrics.
+    """
+    def __init__(self):
+        self.metrics_collection = "PipelineMetrics"
+    def log_segmentation_run(self, metrics: Dict[str, Any]):
+        """
+        Log segmentation pipeline metrics.
+        Metrics should include:
+        - n_users: Number of users processed
+        - n_segments: Number of segments created
+        - inertia: K-means inertia
+        - execution_time: Time in seconds
+        - outliers_removed: Count
+        """
+        logger.info(f"Segmentation Run: {metrics}")
+        # Save to MongoDB for trend analysis
+        doc = {
+            "pipeline": "segmentation",
+            "timestamp": datetime.utcnow(),
+            "metrics": metrics
+        }
+        db.get_collection(self.metrics_collection).insert_one(doc)
+    def log_sentiment_run(self, metrics: Dict[str, Any]):
+        """
+        Log sentiment analysis metrics.
+        Metrics should include:
+        - n_comments: Number of comments analyzed
+        - sentiment_distribution: {Positive: X, Negative: Y, Neutral: Z}
+        - avg_confidence: Average confidence score
+        - execution_time: Time in seconds
+        """
+        logger.info(f"Sentiment Analysis Run: {metrics}")
+        doc = {
+            "pipeline": "sentiment",
+            "timestamp": datetime.utcnow(),
+            "metrics": metrics
+        }
+        db.get_collection(self.metrics_collection).insert_one(doc)
+    def log_genai_run(self, task: str, metrics: Dict[str, Any]):
+        """
+        Log Generative AI metrics.
+        Metrics should include:
+        - n_generated: Number of items generated
+        - avg_generation_time: Average time per item
+        - total_time: Total execution time
+        """
+        logger.info(f"GenAI Run ({task}): {metrics}")
+        doc = {
+            "pipeline": "genai",
+            "task": task,
+            "timestamp": datetime.utcnow(),
+            "metrics": metrics
+        }
+        db.get_collection(self.metrics_collection).insert_one(doc)
+    def log_error(self, pipeline: str, error: Exception, context: Dict = None):
+        """
+        Log pipeline errors.
+        """
+        logger.error(f"Error in {pipeline}: {str(error)}", exc_info=True)
+        doc = {
+            "pipeline": pipeline,
+            "timestamp": datetime.utcnow(),
+            "error": str(error),
+            "error_type": type(error).__name__,
+            "context": context or {}
+        }
+        db.get_collection("PipelineErrors").insert_one(doc)
+    def detect_drift_segmentation(self, current_centroids: np.ndarray) -> Dict:
+        """
+        Detect drift in K-means clustering.
+        Compare current centroids with previous run.
+        """
+        # Fetch last run's centroids
+        last_metric = db.get_collection(self.metrics_collection).find_one(
+            {"pipeline": "segmentation"},
+            sort=[("timestamp", -1)]
+        )
+        if not last_metric or "centroids" not in last_metric["metrics"]:
+            logger.info("No previous centroids found for drift detection")
+            return {"drift_detected": False, "reason": "no_baseline"}
+        # Calculate drift as Euclidean distance between centroids
+        prev_centroids = np.array(last_metric["metrics"]["centroids"])
+        if prev_centroids.shape != current_centroids.shape:
+            return {"drift_detected": True, "reason": "shape_mismatch"}
+        # Calculate average distance
+        distances = np.linalg.norm(current_centroids - prev_centroids, axis=1)
+        avg_drift = float(np.mean(distances))
+        max_drift = float(np.max(distances))
+        # Threshold: if average drift > 0.5 std, flag as drift
+        drift_detected = avg_drift > 0.5
+        result = {
+            "drift_detected": drift_detected,
+            "avg_drift": avg_drift,
+            "max_drift": max_drift,
+            "threshold": 0.5
+        }
+        if drift_detected:
+            logger.warning(f"⚠️ Cluster drift detected: avg={avg_drift:.3f}, max={max_drift:.3f}")
+        return result
+    def detect_drift_sentiment(self, current_distribution: Dict[str, int]) -> Dict:
+        """
+        Detect drift in sentiment distribution.
+        """
+        # Fetch last run's distribution
+        last_metric = db.get_collection(self.metrics_collection).find_one(
+            {"pipeline": "sentiment"},
+            sort=[("timestamp", -1)]
+        )
+        if not last_metric:
+            return {"drift_detected": False, "reason": "no_baseline"}
+        prev_dist = last_metric["metrics"].get("sentiment_distribution", {})
+        # Calculate total counts
+        prev_total = sum(prev_dist.values())
+        curr_total = sum(current_distribution.values())
+        if prev_total == 0 or curr_total == 0:
+            return {"drift_detected": False, "reason": "insufficient_data"}
+        # Calculate percentage change for each sentiment
+        changes = {}
+        for label in ["Positive", "Negative", "Neutral"]:
+            prev_pct = prev_dist.get(label, 0) / prev_total
+            curr_pct = current_distribution.get(label, 0) / curr_total
+            changes[label] = abs(curr_pct - prev_pct)
+        # Drift if any sentiment changes > 10%
+        max_change = max(changes.values())
+        drift_detected = max_change > 0.1
+        result = {
+            "drift_detected": drift_detected,
+            "changes": changes,
+            "max_change": max_change,
+            "threshold": 0.1
+        }
+        if drift_detected:
+            logger.warning(f"⚠️ Sentiment drift detected: max_change={max_change:.1%}")
+        return result
+    def get_performance_summary(self, pipeline: str, days: int = 7) -> Dict:
+        """
+        Get performance summary for the last N days.
+        """
+        from datetime import timedelta
+        cutoff = datetime.utcnow() - timedelta(days=days)
+        metrics = list(db.get_collection(self.metrics_collection).find({
+            "pipeline": pipeline,
+            "timestamp": {"$gte": cutoff}
+        }).sort("timestamp", -1))
+        if not metrics:
+            return {"error": "No metrics found"}
+        # Aggregate
+        total_runs = len(metrics)
+        avg_time = np.mean([m["metrics"].get("execution_time", 0) for m in metrics])
+        return {
+            "pipeline": pipeline,
+            "period_days": days,
+            "total_runs": total_runs,
+            "avg_execution_time": avg_time,
+            "last_run": metrics[0]["timestamp"]
+        }
+# Global monitor instance, created when this module is imported
+monitor = PipelineMonitor()

services/preprocessing.py ADDED Viewed

	@@ -0,0 +1,191 @@

+"""
+Data Preprocessing & Cleaning Module
+Author: AI Generated
+Created: 2025-11-24
+Purpose: Clean and preprocess data before AI processing
+"""
+import re
+from typing import List, Dict
+import numpy as np
+from pyvi import ViTokenizer
+from sklearn.preprocessing import StandardScaler
+class VietnameseTextCleaner:
+    """
+    Clean and preprocess Vietnamese text for NLP tasks.
+    """
+    # Vietnamese stopwords
+    STOP_WORDS = {
+        'và', 'của', 'có', 'là', 'được', 'này', 'cho', 'với', 'các',
+        'đã', 'trong', 'không', 'rất', 'một', 'để', 'những', 'cũng',
+        'về', 'từ', 'hay', 'bị', 'như', 'làm', 'đó', 'lại', 'sẽ',
+        'thì', 'nếu', 'khi', 'mà', 'hoặc', 'nên', 'trên', 'dưới'
+    }
+    def __init__(self):
+        self.tokenizer = ViTokenizer
+    def clean_text(self, text: str) -> str:
+        """
+        Clean Vietnamese text:
+        - Remove HTML tags
+        - Remove special characters
+        - Normalize whitespace
+        - Lowercase
+        """
+        if not text:
+            return ""
+        # Remove HTML tags
+        text = re.sub(r'<[^>]+>', '', text)
+        # Remove URLs
+        text = re.sub(r'http\S+|www\.\S+', '', text)
+        # Remove emails
+        text = re.sub(r'\S+@\S+', '', text)
+        # Remove special characters (keep Vietnamese)
+        text = re.sub(r'[^a-zA-ZàáảãạăắằẵặẳâấầẩẫậèéẻẽẹêếềểễệìíỉĩịòóỏõọôốồổỗộơớờởỡợùúủũụưứừửữựỳýỷỹỵđĐ\s]', ' ', text)
+        # Normalize whitespace
+        text = re.sub(r'\s+', ' ', text).strip()
+        # Lowercase
+        text = text.lower()
+        return text
+    def tokenize(self, text: str) -> List[str]:
+        """
+        Tokenize Vietnamese text using pyvi.
+        Returns list of words.
+        """
+        text = self.clean_text(text)
+        if not text:
+            return []
+        # Use pyvi for Vietnamese word segmentation
+        tokenized = self.tokenizer.tokenize(text)
+        words = tokenized.split()
+        return words
+    def remove_stopwords(self, words: List[str]) -> List[str]:
+        """
+        Remove Vietnamese stopwords.
+        """
+        return [w for w in words if w not in self.STOP_WORDS]
+    def preprocess_for_sentiment(self, text: str) -> str:
+        """
+        Preprocess text for PhoBERT sentiment analysis.
+        PhoBERT expects word-segmented text.
+        """
+        # Clean and tokenize
+        words = self.tokenize(text)
+        # Join back with spaces (word-segmented format)
+        return ' '.join(words)
+    def extract_keywords(self, text: str, top_n: int = 5) -> List[str]:
+        """
+        Extract keywords from text.
+        Simple TF approach without stopwords.
+        """
+        words = self.tokenize(text)
+        words = self.remove_stopwords(words)
+        # Count frequency
+        word_freq = {}
+        for word in words:
+            if len(word) > 2:  # Filter very short words
+                word_freq[word] = word_freq.get(word, 0) + 1
+        # Get top N
+        top_words = sorted(word_freq.items(), key=lambda x: x[1], reverse=True)[:top_n]
+        return [word[0] for word in top_words]
+class DataCleaner:
+    """
+    Clean and validate user feature data for clustering.
+    """
+    def __init__(self):
+        self.scaler = StandardScaler()
+    def remove_outliers(self, data: np.ndarray, threshold: float = 3.0) -> tuple:
+        """
+        Remove outliers using Z-score method.
+        Returns: (cleaned_data, valid_indices)
+        """
+        # Calculate z-scores
+        z_scores = np.abs((data - data.mean(axis=0)) / data.std(axis=0))
+        # Find rows without extreme outliers
+        valid_indices = np.where(np.all(z_scores < threshold, axis=1))[0]
+        cleaned_data = data[valid_indices]
+        removed_count = len(data) - len(cleaned_data)
+        if removed_count > 0:
+            print(f"  ⚠ Removed {removed_count} outliers ({removed_count/len(data)*100:.1f}%)")
+        return cleaned_data, valid_indices
+    def handle_missing_values(self, data: np.ndarray) -> np.ndarray:
+        """
+        Handle missing values (NaN, inf) by replacing with median.
+        """
+        # Replace inf with NaN
+        data = np.where(np.isinf(data), np.nan, data)
+        # Replace NaN with column median
+        col_median = np.nanmedian(data, axis=0)
+        inds = np.where(np.isnan(data))
+        data[inds] = np.take(col_median, inds[1])
+        return data
+    def normalize_features(self, data: np.ndarray, fit: bool = True) -> np.ndarray:
+        """
+        Standardize features using StandardScaler.
+        Args:
+            data: Feature matrix
+            fit: If True, fit scaler. If False, use existing scaler.
+        """
+        if fit:
+            normalized = self.scaler.fit_transform(data)
+        else:
+            normalized = self.scaler.transform(data)
+        return normalized
+    def clean_user_features(self, feature_matrix: np.ndarray, remove_outliers: bool = True) -> tuple:
+        """
+        Complete cleaning pipeline for user features.
+        Returns: (cleaned_features, valid_indices)
+        """
+        print("🔄 Cleaning user feature data...")
+        # Step 1: Handle missing values
+        data = self.handle_missing_values(feature_matrix)
+        print(f"  ✓ Handled missing values")
+        # Step 2: Remove outliers (optional)
+        if remove_outliers:
+            data, valid_indices = self.remove_outliers(data)
+        else:
+            valid_indices = np.arange(len(data))
+        # Step 3: Normalize
+        data = self.normalize_features(data, fit=True)
+        print(f"  ✓ Normalized {data.shape[0]} samples")
+        return data, valid_indices

services/segmentation_service.py ADDED Viewed

	@@ -0,0 +1,292 @@

+"""
+Event-Centric Audience Segmentation Service
+Author: AI Generated
+Created: 2025-11-24 (Refactored for event-centric)
+Purpose: Cluster users per event and save to EventAudienceSegment
+"""
+import numpy as np
+from typing import List, Dict, Tuple
+from datetime import datetime
+from sklearn.cluster import KMeans
+from bson import ObjectId
+from database import db
+from config import settings
+from models.event_models import EventAudienceSegment, MarketingContent
+from services.data_aggregation import UserDataAggregator
+from services.preprocessing import DataCleaner
+from services.monitoring import monitor
+from services.model_registry import registry
+class SegmentationService:
+    """
+    Event-centric user segmentation via K-Means clustering.
+    """
+    def __init__(self, event_code: str, n_clusters: int = None):
+        """
+        Initialize segmentation for a specific event.
+        Args:
+            event_code: Event identifier
+            n_clusters: Number of segments (default: from settings)
+        """
+        self.event_code = event_code
+        self.n_clusters = n_clusters or settings.N_CLUSTERS
+        self.aggregator = UserDataAggregator(event_code)
+        self.data_cleaner = DataCleaner()
+        self.kmeans = None
+        self.scaler = None
+        self.feature_names = []
+    def prepare_feature_matrix(self, user_data: List[Dict]) -> Tuple[np.ndarray, List[str]]:
+        """
+        Convert aggregated user data into feature matrix.
+        Uses hybrid approach:
+        - Event-specific: ticket_count, spend, is_follower
+        - Global: RFM (user's overall power)
+        Returns: (feature_matrix, user_ids)
+        """
+        feature_matrix = []
+        user_ids = []
+        for user in user_data:
+            # Event-specific features
+            event_ticket_count = user.get('event_ticket_count', 0)
+            event_total_spend = user.get('event_total_spend', 0)
+            is_follower = user.get('is_follower', 0)
+            # Global RFM (user power)
+            global_recency = user.get('global_recency', 999999)
+            global_frequency = user.get('global_frequency', 0)
+            global_monetary = user.get('global_monetary', 0)
+            # Combine features
+            features = [
+                event_ticket_count,
+                event_total_spend,
+                is_follower,
+                global_recency,
+                global_frequency,
+                global_monetary
+            ]
+            feature_matrix.append(features)
+            user_ids.append(str(user['user_id']))
+        # Store feature names
+        self.feature_names = [
+            'event_tickets',
+            'event_spend',
+            'is_follower',
+            'global_recency',
+            'global_frequency',
+            'global_monetary'
+        ]
+        return np.array(feature_matrix), user_ids
+    def fit_clustering(self, feature_matrix: np.ndarray) -> Tuple[KMeans, List[int]]:
+        """
+        Fit K-Means with preprocessing.
+        """
+        # Clean and normalize
+        normalized_features, valid_indices = self.data_cleaner.clean_user_features(feature_matrix)
+        # Save scaler for later use
+        self.scaler = self.data_cleaner.scaler
+        print(f"🔄 Fitting K-Means with {self.n_clusters} clusters...")
+        self.kmeans = KMeans(
+            n_clusters=self.n_clusters,
+            random_state=settings.RANDOM_STATE,
+            n_init=10
+        )
+        self.kmeans.fit(normalized_features)
+        print(f"✓ Clustering complete. Inertia: {self.kmeans.inertia_:.2f}")
+        return self.kmeans, valid_indices
+    def interpret_cluster(self, cluster_id: int) -> Dict:
+        """
+        Interpret cluster characteristics.
+        """
+        centroid = self.kmeans.cluster_centers_[cluster_id]
+        centroid_original = self.scaler.inverse_transform([centroid])[0]
+        interpretation = {}
+        for i, feature_name in enumerate(self.feature_names):
+            interpretation[feature_name] = float(centroid_original[i])
+        # Generate segment name
+        event_spend = interpretation.get('event_spend', 0)
+        event_tickets = interpretation.get('event_tickets', 0)
+        global_monetary = interpretation.get('global_monetary', 0)
+        is_follower = interpretation.get('is_follower', 0)
+        segment_name = self._generate_segment_name(
+            event_spend, event_tickets, global_monetary, is_follower
+        )
+        return {
+            "segment_name": segment_name,
+            "criteria": interpretation,
+            "cluster_id": cluster_id
+        }
+    def _generate_segment_name(
+        self,
+        event_spend: float,
+        event_tickets: float,
+        global_monetary: float,
+        is_follower: float
+    ) -> str:
+        """Generate Vietnamese segment name."""
+        # High spenders on this event
+        if event_spend > 500000 and event_tickets > 2:
+            return "Khách Hàng VIP Sự Kiện"
+        # Bought tickets but moderate spend
+        elif event_tickets > 0 and event_spend > 100000:
+            return "Khách Hàng Tích Cực"
+        # Only followers, no tickets yet
+        elif is_follower > 0.5 and event_tickets == 0:
+            return "Người Theo Dõi Tiềm Năng"
+        # High global value but low event engagement
+        elif global_monetary > 1000000 and event_spend < 100000:
+            return "Khách Hàng Chưa Khai Phá"
+        # Low event engagement
+        else:
+            return "Khách Hàng Ít Tương Tác"
+    def save_segments_to_db(
+        self,
+        cluster_interpretations: List[Dict],
+        user_ids: List[str],
+        labels: np.ndarray
+    ) -> List[ObjectId]:
+        """
+        Save to EventAudienceSegment collection.
+        """
+        print("🔄 Saving event segments to database...")
+        segment_ids = []
+        for cluster_info in cluster_interpretations:
+            cluster_id = cluster_info['cluster_id']
+            # Get user_ids in this cluster
+            cluster_user_indices = np.where(labels == cluster_id)[0]
+            cluster_user_ids = [ObjectId(user_ids[i]) for i in cluster_user_indices]
+            segment = EventAudienceSegment(
+                event_code=self.event_code,
+                segment_name=cluster_info['segment_name'],
+                segment_type=cluster_info['segment_name'],  # Can categorize further
+                user_count=len(cluster_user_ids),
+                user_ids=cluster_user_ids,
+                criteria=cluster_info['criteria'],
+                marketing_content=None,  # Will be generated by GenAI
+                created_at=datetime.utcnow(),
+                last_updated=datetime.utcnow()
+            )
+            result = db.event_audience_segments.insert_one(
+                segment.dict(by_alias=True, exclude={'id'})
+            )
+            segment_ids.append(result.inserted_id)
+            print(f"  ✓ '{segment.segment_name}': {len(cluster_user_ids)} users")
+        return segment_ids
+    def run_segmentation(self) -> List[ObjectId]:
+        """
+        Execute event-centric segmentation pipeline.
+        """
+        import time
+        start_time = time.time()
+        print("=" * 60)
+        print(f"🚀 Segmenting Event: {self.event_code}")
+        print("=" * 60)
+        try:
+            # Step 1: Aggregate event users
+            user_data = self.aggregator.aggregate_user_features()
+            if len(user_data) < self.n_clusters:
+                print(f"⚠ Not enough users ({len(user_data)}) for {self.n_clusters} clusters")
+                return []
+            # Step 2: Prepare features
+            feature_matrix, user_ids = self.prepare_feature_matrix(user_data)
+            print(f"✓ Feature matrix: {feature_matrix.shape}")
+            # Step 3: Clustering
+            self.kmeans, valid_indices = self.fit_clustering(feature_matrix)
+            user_ids = [user_ids[i] for i in valid_indices]
+            # Step 4: Get labels
+            normalized_features = self.scaler.transform(feature_matrix[valid_indices])
+            labels = self.kmeans.labels_
+            # Step 5: Interpret clusters
+            cluster_interpretations = [
+                self.interpret_cluster(i) for i in range(self.n_clusters)
+            ]
+            # Step 6: Save to EventAudienceSegment
+            segment_ids = self.save_segments_to_db(
+                cluster_interpretations,
+                user_ids,
+                labels
+            )
+            # Step 7: Save model
+            metadata = {
+                "event_code": self.event_code,
+                "n_clusters": self.n_clusters,
+                "n_users": len(user_ids),
+                "inertia": float(self.kmeans.inertia_)
+            }
+            registry.save_model(
+                self.kmeans,
+                f"kmeans_{self.event_code}",
+                metadata
+            )
+            # Step 8: Monitoring
+            execution_time = time.time() - start_time
+            metrics = {
+                "event_code": self.event_code,
+                "n_users": len(user_ids),
+                "n_segments": self.n_clusters,
+                "inertia": float(self.kmeans.inertia_),
+                "execution_time": execution_time,
+                "centroids": self.kmeans.cluster_centers_.tolist()
+            }
+            monitor.log_segmentation_run(metrics)
+            print("=" * 60)
+            print("✅ Segmentation Complete!")
+            print(f"⏱️  Time: {execution_time:.2f}s")
+            print("=" * 60)
+            return segment_ids
+        except Exception as e:
+            monitor.log_error("segmentation", e, {
+                "event_code": self.event_code,
+                "n_clusters": self.n_clusters
+            })
+            raise

services/sentiment_service.py ADDED Viewed

	@@ -0,0 +1,220 @@

+"""
+Event-Centric Sentiment Analysis Service
+Author: AI Generated
+Created: 2025-11-24 (Refactored)
+Purpose: Analyze sentiment for event comments and generate summary
+"""
+import torch
+from transformers import AutoTokenizer, AutoModelForSequenceClassification
+from typing import Tuple, List, Dict
+from datetime import datetime
+from bson import ObjectId
+from database import db
+from config import settings
+from models.sentiment_models import SentimentAnalysisResult
+from models.event_models import EventSentimentSummary, AIInsights
+from services.preprocessing import VietnameseTextCleaner
+from services.monitoring import monitor
+class SentimentAnalysisService:
+    """
+    Event-centric sentiment analysis using PhoBERT.
+    """
+    def __init__(self, event_code: str):
+        """
+        Initialize for a specific event.
+        Args:
+            event_code: Event identifier
+        """
+        self.event_code = event_code
+        self.model_name = settings.SENTIMENT_MODEL
+        self.tokenizer = None
+        self.model = None
+        self.device = "cuda" if torch.cuda.is_available() else "cpu"
+        self.label_map = {0: "Negative", 1: "Positive", 2: "Neutral"}
+        self.text_cleaner = VietnameseTextCleaner()
+    def load_model(self):
+        """Load PhoBERT model"""
+        print(f"🔄 Loading sentiment model: {self.model_name}")
+        token = settings.HF_TOKEN if settings.HF_TOKEN else None
+        self.tokenizer = AutoTokenizer.from_pretrained(
+            self.model_name,
+            token=token
+        )
+        self.model = AutoModelForSequenceClassification.from_pretrained(
+            self.model_name,
+            token=token
+        )
+        self.model.to(self.device)
+        self.model.eval()
+        print(f"✓ Model loaded on {self.device}")
+    def analyze_text(self, text: str) -> Tuple[str, float]:
+        """Analyze single text with preprocessing."""
+        if not self.model:
+            self.load_model()
+        # Preprocess
+        preprocessed = self.text_cleaner.preprocess_for_sentiment(text)
+        if not preprocessed:
+            return "Neutral", 0.5
+        # Tokenize
+        inputs = self.tokenizer(
+            preprocessed,
+            return_tensors="pt",
+            truncation=True,
+            max_length=256,
+            padding=True
+        ).to(self.device)
+        # Predict
+        with torch.no_grad():
+            outputs = self.model(**inputs)
+            logits = outputs.logits
+            probs = torch.softmax(logits, dim=-1)
+            predicted_class = torch.argmax(probs, dim=-1).item()
+            confidence = probs[0][predicted_class].item()
+        sentiment_label = self.label_map.get(predicted_class, "Neutral")
+        return sentiment_label, confidence
+    def analyze_event_comments(self) -> Dict:
+        """
+        Analyze all comments for this event.
+        Returns summary statistics.
+        """
+        import time
+        start_time = time.time()
+        print("=" * 60)
+        print(f"🚀 Analyzing Sentiment for Event: {self.event_code}")
+        print("=" * 60)
+        try:
+            if not self.model:
+                self.load_model()
+            # Fetch comments for THIS EVENT only
+            comments = list(db.user_comment_post.find({
+                "EventCode": self.event_code
+            }).limit(1000))
+            print(f"✓ Found {len(comments)} comments for this event")
+            if not comments:
+                print("⚠ No comments to analyze")
+                return {}
+            # Analyze each
+            results_to_save = []
+            sentiment_counts = {"Positive": 0, "Negative": 0, "Neutral": 0}
+            total_confidence = 0
+            all_keywords = []
+            for comment in comments:
+                text = comment.get('CommentText', '')
+                if not text:
+                    continue
+                sentiment, confidence = self.analyze_text(text)
+                keywords = self.text_cleaner.extract_keywords(text, top_n=3)
+                # Save individual result
+                result = SentimentAnalysisResult(
+                    source_id=comment['_id'],
+                    source_type="UserCommentPost",
+                    event_code=self.event_code,  # NEW: link to event
+                    sentiment_label=sentiment,
+                    confidence_score=confidence,
+                    key_phrases=keywords,
+                    analyzed_at=datetime.utcnow()
+                )
+                results_to_save.append(result.dict(by_alias=True, exclude={'id'}))
+                # Update counts
+                sentiment_counts[sentiment] += 1
+                total_confidence += confidence
+                all_keywords.extend(keywords)
+            # Bulk insert results
+            if results_to_save:
+                db.sentiment_results.insert_many(results_to_save)
+                print(f"✓ Saved {len(results_to_save)} sentiment results")
+            # Calculate summary
+            avg_confidence = total_confidence / len(results_to_save) if results_to_save else 0
+            # Get top keywords
+            keyword_freq = {}
+            for kw in all_keywords:
+                keyword_freq[kw] = keyword_freq.get(kw, 0) + 1
+            top_keywords = sorted(
+                keyword_freq.items(),
+                key=lambda x: x[1],
+                reverse=True
+            )[:10]
+            top_keywords = [kw[0] for kw in top_keywords]
+            # Save summary
+            summary = EventSentimentSummary(
+                event_code=self.event_code,
+                total_comments=len(results_to_save),
+                sentiment_distribution=sentiment_counts,
+                avg_confidence=avg_confidence,
+                top_keywords=top_keywords,
+                ai_insights=None,  # Will be filled by GenAI
+                last_updated=datetime.utcnow()
+            )
+            db.event_sentiment_summary.update_one(
+                {"event_code": self.event_code},
+                {"$set": summary.dict(by_alias=True, exclude={'id'})},
+                upsert=True
+            )
+            # Print summary
+            print("\n📊 Sentiment Distribution:")
+            for label, count in sentiment_counts.items():
+                pct = (count / len(results_to_save) *100) if results_to_save else 0
+                print(f"  {label}: {count} ({pct:.1f}%)")
+            # Log metrics
+            execution_time = time.time() - start_time
+            metrics = {
+                "event_code": self.event_code,
+                "n_comments": len(results_to_save),
+                "sentiment_distribution": sentiment_counts,
+                "avg_confidence": avg_confidence,
+                "execution_time": execution_time
+            }
+            monitor.log_sentiment_run(metrics)
+            print("=" * 60)
+            print("✅ Sentiment Analysis Complete!")
+            print(f"⏱️  Time: {execution_time:.2f}s")
+            print("=" * 60)
+            return {
+                "total_comments": len(results_to_save),
+                "sentiment_distribution": sentiment_counts,
+                "avg_confidence": avg_confidence,
+                "top_keywords": top_keywords
+            }
+        except Exception as e:
+            monitor.log_error("sentiment", e, {
+                "event_code": self.event_code,
+                "model": self.model_name
+            })
+            raise