# dss/tools/api/ingestion_parser.py

"""
Natural Language Parser for Design System Ingestion.

This module parses natural language prompts to understand:
- Intent (ingest, search, compare, etc.)
- Design system names
- Alternative sources (Figma URLs, images, etc.)
- Configuration options
"""

import re
from dataclasses import dataclass, field
from typing import List, Optional, Dict, Any, Tuple
from enum import Enum

from design_system_registry import (
    find_design_system,
    search_design_systems,
    get_alternative_ingestion_options,
    DesignSystemInfo,
)


class IngestionIntent(Enum):
    """Enumerates what a user is asking for in a design-system prompt.

    Each member's value is the lowercase string emitted when the intent is
    serialized through ``.value``.
    """

    # Add/import a design system.
    INGEST = "ingest"
    # Search for design systems.
    SEARCH = "search"
    # List available/known systems.
    LIST = "list"
    # Get info about a specific system.
    INFO = "info"
    # Compare design systems.
    COMPARE = "compare"
    # Configure ingestion settings.
    CONFIGURE = "configure"
    # Help with ingestion.
    HELP = "help"
    # Fallback member for an unrecognized intent.
    UNKNOWN = "unknown"


class SourceType(Enum):
    """Enumerates the kinds of source a prompt can point at.

    Each member's value is the snake_case string emitted when the source
    type is serialized through ``.value``.
    """

    # A design system referred to by name.
    DESIGN_SYSTEM_NAME = "design_system_name"
    # An npm package reference.
    NPM_PACKAGE = "npm_package"
    # A link to Figma.
    FIGMA_URL = "figma_url"
    # A link to GitHub.
    GITHUB_URL = "github_url"
    # A link to a stylesheet.
    CSS_URL = "css_url"
    # A link to an image.
    IMAGE_URL = "image_url"
    # A free-text description.
    TEXT_DESCRIPTION = "text_description"


@dataclass
class ParsedSource:
    """One source reference detected in a prompt."""

    # Category of the detected source.
    source_type: SourceType
    # Source text as extracted from the prompt.
    value: str
    # Detection confidence, from 0.0 to 1.0.
    confidence: float = field(default=1.0)
    # Registry entry matched for this source; None when nothing was matched.
    matched_system: Optional[DesignSystemInfo] = field(default=None)


@dataclass
class ParsedIngestionPrompt:
    """Outcome of parsing one ingestion prompt."""

    # Prompt text as it was passed to the parser.
    original_prompt: str
    # Detected intent.
    intent: IngestionIntent
    # Overall confidence for this parse.
    confidence: float = field(default=1.0)
    # Sources detected in the prompt.
    sources: List[ParsedSource] = field(default_factory=list)
    # Free-form options mapping.
    options: Dict[str, Any] = field(default_factory=dict)
    # Human-readable hints for the user.
    suggestions: List[str] = field(default_factory=list)

    def to_dict(self) -> Dict[str, Any]:
        """Return a plain-dict form of this result for API responses.

        Enum members are replaced by their ``.value`` and each matched
        system by its own ``to_dict()`` output (``None`` when a source has
        no matched system).
        """
        payload: Dict[str, Any] = {}
        payload["original_prompt"] = self.original_prompt
        payload["intent"] = self.intent.value
        payload["confidence"] = self.confidence

        serialized_sources = []
        for src in self.sources:
            entry = {
                "type": src.source_type.value,
                "value": src.value,
                "confidence": src.confidence,
            }
            if src.matched_system:
                entry["matched_system"] = src.matched_system.to_dict()
            else:
                entry["matched_system"] = None
            serialized_sources.append(entry)
        payload["sources"] = serialized_sources

        payload["options"] = self.options
        payload["suggestions"] = self.suggestions
        return payload


# Intent detection patterns.
#
# Maps each IngestionIntent to the regular expressions that signal it.
# detect_intent() lowercases the prompt and scores an intent by the number of
# re.findall() hits across its patterns, so the alternatives here are written
# in lowercase. IngestionIntent.UNKNOWN has no entry.
#
# Entry order matters on ties: detect_intent() selects the winner with max(),
# which returns the first of several equally scored intents.
INTENT_PATTERNS = {
    IngestionIntent.INGEST: [
        # Single-word ingestion verbs.
        r'\b(ingest|import|add|use|install|load|get|fetch|download|setup|init|initialize)\b',
        # Request phrasings.
        r'\b(i want|i need|give me|let\'s use|can you add|please add)\b',
        # Integration verbs, including two-word forms.
        r'\b(integrate|incorporate|bring in|pull in)\b',
    ],
    IngestionIntent.SEARCH: [
        r'\b(search|find|look for|looking for|discover|explore)\b',
        # Open-ended questions about what exists.
        r'\b(what.*available|show me.*options|any.*like)\b',
    ],
    IngestionIntent.LIST: [
        # A listing verb or question word followed, later in the prompt, by a
        # phrase such as "design systems" or "available".
        r'\b(list|show|display|what|which)\b.*(design systems?|available|supported|known)\b',
        r'\b(what do you (know|have|support))\b',
    ],
    IngestionIntent.INFO: [
        r'\b(info|information|details|about|tell me about|what is)\b',
        r'\b(how does|what\'s|describe)\b',
    ],
    IngestionIntent.COMPARE: [
        # A comparison keyword followed, later in the prompt, by a joining
        # word; "or", "vs" and "versus" appear in both groups.
        r'\b(compare|versus|vs|difference|between|or)\b.*\b(and|vs|versus|or)\b',
    ],
    IngestionIntent.CONFIGURE: [
        r'\b(configure|config|settings?|options?|customize)\b',
    ],
    IngestionIntent.HELP: [
        r'\b(help|how to|how do i|what can|guide|tutorial)\b',
    ],
}

# URL patterns.
#
# Maps a SourceType to a regular expression with a single capturing group
# that holds the source text. extract_urls() applies every entry except
# NPM_PACKAGE, using re.IGNORECASE.
URL_PATTERNS = {
    # Figma links under /file/, /design/ or /community/file/; the http(s)
    # scheme is required.
    SourceType.FIGMA_URL: r'(https?://(?:www\.)?figma\.com/(?:file|design|community/file)/[^\s]+)',
    # Any http(s) link on github.com.
    SourceType.GITHUB_URL: r'(https?://(?:www\.)?github\.com/[^\s]+)',
    # Package name with an optional "npm:" prefix and optional "@scope/" part.
    SourceType.NPM_PACKAGE: r'(?:npm:)?(@?[a-z0-9][\w\-\.]*(?:/[a-z0-9][\w\-\.]*)?)',
    # http(s) links ending in .css/.scss/.sass, with an optional query string.
    SourceType.CSS_URL: r'(https?://[^\s]+\.(?:css|scss|sass)(?:\?[^\s]*)?)',
    # http(s) links ending in a listed image extension, with an optional
    # query string.
    SourceType.IMAGE_URL: r'(https?://[^\s]+\.(?:png|jpg|jpeg|gif|webp|svg)(?:\?[^\s]*)?)',
}


def detect_intent(prompt: str) -> Tuple[IngestionIntent, float]:
    """
    Work out which intent a prompt expresses.

    The prompt is lowercased and every intent in INTENT_PATTERNS is scored
    by the total number of regex hits across its patterns.

    Returns:
        (intent, confidence). When no pattern matches at all the result is
        (IngestionIntent.INGEST, 0.5). Otherwise the highest-scoring intent
        is returned with a confidence of 0.4 plus 0.3 per hit, capped at 1.0.
    """
    text = prompt.lower()

    # Total hits per intent across all of that intent's patterns.
    hit_counts = {
        intent: sum(len(re.findall(regex, text)) for regex in regexes)
        for intent, regexes in INTENT_PATTERNS.items()
    }

    # Nothing matched: fall back to INGEST with a middling confidence.
    if all(count == 0 for count in hit_counts.values()):
        return IngestionIntent.INGEST, 0.5

    # max() keeps the first of equally scored intents, so ties are resolved
    # by the ordering of INTENT_PATTERNS.
    winner = max(hit_counts, key=hit_counts.get)
    top_count = hit_counts[winner]

    return winner, min(1.0, 0.4 + 0.3 * top_count)


def extract_urls(prompt: str) -> List[ParsedSource]:
    """Collect URL sources from the prompt.

    Every pattern in URL_PATTERNS except the npm one is applied
    case-insensitively; each hit becomes a ParsedSource with confidence 0.95.
    """
    found = []

    for kind, regex in URL_PATTERNS.items():
        # npm references are not URLs; they are extracted elsewhere.
        if kind != SourceType.NPM_PACKAGE:
            found.extend(
                ParsedSource(source_type=kind, value=url, confidence=0.95)
                for url in re.findall(regex, prompt, re.IGNORECASE)
            )

    return found


def extract_design_systems(prompt: str) -> List[ParsedSource]:
    """
    Find design system names mentioned in the prompt.

    URLs are stripped, the remaining text is lowercased and split on
    whitespace, and every 3-, 2- and 1-word window (longest first) is looked
    up with find_design_system(). Each system is reported once, under the
    first phrase that matched it; multi-word phrases get confidence 0.9 and
    single words 0.7.
    """
    # Strip URLs up front so their fragments are not mistaken for names.
    url_free = re.sub(r'https?://[^\s]+', '', prompt)
    tokens = url_free.lower().split()

    # Windows made up only of these words are never looked up.
    noise_words = ['the', 'a', 'an', 'from', 'to', 'with', 'for', 'and', 'or', 'in', 'on', 'at']

    detected = []

    # Longest windows first.
    for size in (3, 2, 1):
        for start in range(len(tokens) - size + 1):
            window = tokens[start:start + size]
            if all(tok in noise_words for tok in window):
                continue

            phrase = ' '.join(window)
            system = find_design_system(phrase)
            if not system:
                continue

            # Report each registry entry only once.
            already_reported = any(
                prior.matched_system and prior.matched_system.id == system.id
                for prior in detected
            )
            if already_reported:
                continue

            detected.append(ParsedSource(
                source_type=SourceType.DESIGN_SYSTEM_NAME,
                value=phrase,
                confidence=0.9 if size > 1 else 0.7,
                matched_system=system,
            ))

    return detected


def extract_npm_packages(prompt: str) -> List[ParsedSource]:
    """Extract explicit npm package references from a prompt.

    The prompt is lowercased and http(s) URLs are removed before matching, so
    fragments such as ``github.com/owner`` are not reported as packages.
    A candidate is kept only when it contains ``@``, ``/`` or ``-``, which
    is what distinguishes a package-like token from an ordinary word.

    Args:
        prompt: The raw user prompt.

    Returns:
        One ParsedSource of type NPM_PACKAGE (confidence 0.8) per candidate,
        in order of appearance. The list is empty when nothing qualifies.
    """
    # Lowercase first so an upper-case scheme ("HTTPS://") is removed too.
    # Each URL is replaced by a space so the words around it stay separate.
    text = re.sub(r'https?://[^\s]+', ' ', prompt.lower())

    # Match @scope/package or package-name patterns, optionally preceded by
    # an "npm" / "npm:" prefix.
    npm_pattern = r'(?:npm[:\s]+)?(@[a-z0-9][\w\-\.]+/[\w\-\.]+|[a-z][\w\-\.]*(?:/[\w\-\.]+)?)'

    # Ordinary words that the pattern also matches.
    common_words = {'design', 'system', 'use', 'the', 'and', 'for', 'from'}

    sources = []
    for match in re.findall(npm_pattern, text):
        # The pattern allows dots inside names, so a sentence-ending "." is
        # captured as well; drop it.
        candidate = match.rstrip('.')

        if candidate in common_words:
            continue

        # Keep only tokens that look like an npm package (have @, /, or -).
        if '@' in candidate or '/' in candidate or '-' in candidate:
            sources.append(ParsedSource(
                source_type=SourceType.NPM_PACKAGE,
                value=candidate,
                confidence=0.8,
            ))

    return sources


def generate_suggestions(parsed: ParsedIngestionPrompt) -> List[str]:
    """Build user-facing hints for a parsed prompt.

    SEARCH and HELP intents get two fixed hints each. An INGEST intent gets
    either a "nothing detected" pair of hints or, for every source with a
    matched system, a description line plus optional npm and Figma lines.
    Every other intent yields an empty list.
    """
    intent = parsed.intent

    if intent == IngestionIntent.SEARCH:
        return [
            "I can search npm registry for design systems",
            "Try being more specific, like 'search for material design components'",
        ]

    if intent == IngestionIntent.HELP:
        return [
            "I can ingest design systems from: npm packages, Figma, GitHub, CSS files, or images",
            "Try: 'add heroui' or 'ingest from figma.com/file/...'",
        ]

    if intent != IngestionIntent.INGEST:
        return []

    if not parsed.sources:
        return [
            "No design system detected. Try specifying a name like 'heroui', 'shadcn', or 'mui'",
            "You can also provide a Figma URL, npm package, or GitHub repository",
        ]

    hints = []
    for src in parsed.sources:
        info = src.matched_system
        if not info:
            # Sources without a registry match contribute no hint.
            continue
        hints.append(f"Found '{info.name}' - {info.description}")
        if info.npm_packages:
            hints.append(f"Will install: {', '.join(info.npm_packages)}")
        if info.figma_community_url:
            hints.append(f"Figma kit available: {info.figma_community_url}")
    return hints


def parse_ingestion_prompt(prompt: str) -> ParsedIngestionPrompt:
    """
    Turn a natural language prompt into a ParsedIngestionPrompt.

    Steps: detect the intent, gather URL / design-system / npm sources,
    drop duplicate (type, value) pairs keeping the most confident one,
    generate suggestions, and finally blend intent and source confidence.

    Examples (name matches depend on the design system registry):
        "add heroui" -> INGEST intent, name looked up in the registry
        "import from https://figma.com/file/abc123" -> Figma URL source
        "use @chakra-ui/react" -> npm package source
        "what design systems do you support?" -> LIST intent
    """
    intent, intent_confidence = detect_intent(prompt)

    # Gather candidates from every extractor: URLs, then names, then npm.
    candidates = [
        *extract_urls(prompt),
        *extract_design_systems(prompt),
        *extract_npm_packages(prompt),
    ]

    # Visit candidates from most to least confident so the first occurrence
    # of each (type, lowercased value) pair is the one that is kept.
    ranked = sorted(candidates, key=lambda s: s.confidence, reverse=True)
    seen_keys = set()
    kept = []
    for candidate in ranked:
        ident = (candidate.source_type, candidate.value.lower())
        if ident in seen_keys:
            continue
        seen_keys.add(ident)
        kept.append(candidate)

    result = ParsedIngestionPrompt(
        original_prompt=prompt,
        intent=intent,
        confidence=intent_confidence,
        sources=kept,
    )

    result.suggestions = generate_suggestions(result)

    # With at least one source, overall confidence is the mean of the intent
    # confidence and the best source confidence.
    if kept:
        best_source_confidence = max(s.confidence for s in kept)
        result.confidence = (intent_confidence + best_source_confidence) / 2

    return result


def parse_and_suggest(prompt: str) -> Dict[str, Any]:
    """
    Parse a prompt and attach suggested next steps.

    Returns the dict form of the parsed prompt with an extra "next_steps"
    key: a list of action dicts whose content depends on the detected
    intent (INGEST, SEARCH, LIST or INFO). Other intents get an empty list.
    """
    parsed = parse_ingestion_prompt(prompt)
    response = parsed.to_dict()

    steps = []
    intent = parsed.intent

    if intent == IngestionIntent.INGEST:
        if not parsed.sources:
            # Nothing to ingest was found: offer the alternative routes.
            fallback = get_alternative_ingestion_options()
            steps.append({
                "action": "request_source",
                "alternatives": fallback["alternatives"],
                "message": "No design system detected. Please provide more details:"
            })
        else:
            # One step per source kind that has a known follow-up.
            for src in parsed.sources:
                kind = src.source_type
                if kind == SourceType.DESIGN_SYSTEM_NAME and src.matched_system:
                    info = src.matched_system
                    steps.append({
                        "action": "confirm_ingestion",
                        "system": info.to_dict(),
                        "message": f"Ready to ingest '{info.name}'. Confirm to proceed?"
                    })
                elif kind == SourceType.FIGMA_URL:
                    steps.append({
                        "action": "ingest_figma",
                        "url": src.value,
                        "message": "Figma URL detected. Ready to extract design tokens?"
                    })
                elif kind == SourceType.NPM_PACKAGE:
                    steps.append({
                        "action": "search_npm",
                        "package": src.value,
                        "message": f"Will search npm for '{src.value}'"
                    })

    elif intent == IngestionIntent.SEARCH:
        # Whatever remains after removing the search verbs is the query.
        query = re.sub(r'\b(search|find|look for)\b', '', prompt.lower()).strip()
        if query:
            hits = search_design_systems(query)
            if hits:
                steps.append({
                    "action": "show_search_results",
                    "results": [hit.to_dict() for hit in hits],
                    "message": f"Found {len(hits)} matching design systems"
                })
            else:
                steps.append({
                    "action": "search_npm",
                    "query": query,
                    "message": f"No built-in match. Will search npm for '{query}'"
                })

    elif intent == IngestionIntent.LIST:
        # Imported here, so the name is only needed when listing.
        from design_system_registry import get_all_systems
        catalog = get_all_systems()
        steps.append({
            "action": "show_all_systems",
            "count": len(catalog),
            "categories": list(set(s.category for s in catalog)),
            "message": f"I know about {len(catalog)} design systems"
        })

    elif intent == IngestionIntent.INFO:
        for src in parsed.sources:
            if not src.matched_system:
                continue
            info = src.matched_system
            options = get_alternative_ingestion_options(info)
            steps.append({
                "action": "show_info",
                "system": info.to_dict(),
                "alternatives": options,
                "message": f"Information about {info.name}"
            })

    response["next_steps"] = steps
    return response


# Shortcut for callers that only need the top match.
def quick_parse(prompt: str) -> Tuple[Optional[DesignSystemInfo], IngestionIntent, float]:
    """
    Parse a prompt and return only its headline result.

    Returns:
        (system, intent, confidence), where system is the matched system of
        the first parsed source that has one, or None when no source does.
    """
    parsed = parse_ingestion_prompt(prompt)

    # First source carrying a matched system, in the parser's source order.
    first_match = next(
        (src.matched_system for src in parsed.sources if src.matched_system),
        None,
    )

    return first_match, parsed.intent, parsed.confidence