# dss/demo/tools/ingest/css.py

"""
CSS Token Source

Extracts design tokens from CSS custom properties (CSS variables).
Parses :root declarations and other CSS variable definitions.
"""

import re
from bisect import bisect_right
from pathlib import Path
from typing import List, Optional, Tuple

from .base import DesignToken, TokenCollection, TokenSource, TokenType, TokenCategory


class CSSTokenSource(TokenSource):
    """
    Extract tokens from CSS files.

    Parses CSS custom properties defined in :root or other selectors.
    Supports:
    - :root { --color-primary: #3B82F6; }
    - [data-theme="dark"] { --color-primary: #60A5FA; }
    - Comments as descriptions (including /** doc-style */ comments)
    - A final declaration without a trailing semicolon (minified CSS)
    """

    # One custom property declaration, optionally preceded by a comment.
    _VAR_PATTERN = re.compile(
        r'(/\*(?:[^*]|\*(?!/))*\*/\s*)?'  # Optional preceding comment (may contain '*')
        r'(?<![\w-])(--[\w-]+)\s*:\s*'    # Variable name, not part of a longer identifier
        r'([^;{}]+)'                      # Value (never crosses a rule boundary)
        r'(?:;|(?=\}))'                   # Ends at ';' or right before the closing brace
    )

    # Any CSS comment; used to keep comments out of selector text.
    _COMMENT_PATTERN = re.compile(r'/\*.*?\*/', re.DOTALL)

    @property
    def source_type(self) -> str:
        """Identifier of this token source kind."""
        return "css"

    async def extract(self, source: str) -> TokenCollection:
        """
        Extract tokens from CSS file or content.

        Args:
            source: File path or CSS content string

        Returns:
            TokenCollection with extracted tokens

        Raises:
            FileNotFoundError: If source looks like a file path but the
                file does not exist.
        """
        # Determine if source is file path or content
        if self._is_file_path(source):
            file_path = Path(source)
            if not file_path.exists():
                raise FileNotFoundError(f"CSS file not found: {source}")
            content = file_path.read_text(encoding="utf-8")
            source_file = str(file_path.absolute())
        else:
            content = source
            source_file = "<inline>"

        tokens = self._parse_css(content, source_file)

        return TokenCollection(
            tokens=tokens,
            name=f"CSS Tokens from {Path(source_file).name if source_file != '<inline>' else 'inline'}",
            sources=[self._create_source_id(source_file)],
        )

    def _is_file_path(self, source: str) -> bool:
        """
        Check if source looks like a file path rather than CSS content.

        Args:
            source: File path or CSS content string

        Returns:
            True when source should be read from disk.
        """
        # If it contains CSS syntax, it's content
        if '{' in source or (':' in source and ';' in source):
            return False
        # If it ends with .css, it's a file (existence is checked by the caller)
        if source.endswith('.css'):
            return True
        # Otherwise only an existing regular file counts. Arbitrary text can be
        # an invalid path (too long, embedded NUL), which must not crash here.
        try:
            return Path(source).is_file()
        except (OSError, ValueError):
            return False

    def _parse_css(self, content: str, source_file: str) -> List[DesignToken]:
        """
        Parse CSS content and extract custom properties.

        Args:
            content: Raw CSS text
            source_file: Path (or "<inline>") recorded on every token

        Returns:
            One DesignToken per custom property declaration, in file order.
        """
        tokens = []
        line_map = self._build_line_map(content)

        # Find variables in all rule blocks
        for match in self._VAR_PATTERN.finditer(content):
            comment, var_name, raw_value = match.groups()
            var_value = raw_value.strip()

            # Report the line of the declaration itself, not of its comment
            line_num = self._get_line_number(match.start(2), line_map)

            # Extract description from comment
            description = self._clean_comment(comment) if comment else ""

            # Get context (selector); searching from the start of the match
            # keeps the attached comment out of the brace lookup
            context = self._get_selector_context(content, match.start())

            token = DesignToken(
                name=self._normalize_var_name(var_name),
                value=var_value,
                description=description,
                source=self._create_source_id(source_file, line_num),
                source_file=source_file,
                source_line=line_num,
                original_name=var_name,
                original_value=var_value,
            )

            # Add context as tag if not :root
            if context and context != ":root":
                token.tags.append(f"context:{context}")

            tokens.append(token)

        return tokens

    def _build_line_map(self, content: str) -> List[int]:
        """
        Build the list of character offsets at which each line starts.

        Entry ``i`` is the offset of the first character of line ``i + 1``.
        """
        line_map = []
        pos = 0
        for line in content.split('\n'):
            line_map.append(pos)
            pos += len(line) + 1  # +1 for newline
        return line_map

    def _get_line_number(self, pos: int, line_map: List[int]) -> int:
        """
        Get the 1-based line number for a character position.

        Args:
            pos: Character offset into the content
            line_map: Sorted line start offsets from _build_line_map

        Returns:
            Line number containing pos; positions past the end map to the
            last line, and an empty map yields 1.
        """
        # The number of line starts <= pos is exactly the 1-based line number
        return max(1, bisect_right(line_map, pos))

    def _normalize_var_name(self, var_name: str) -> str:
        """Convert CSS variable name to token name (--a-b becomes a.b)."""
        # Remove -- prefix, then convert kebab-case to dot notation
        return var_name.lstrip('-').replace('-', '.')

    def _clean_comment(self, comment: str) -> str:
        """
        Extract text from CSS comment.

        Strips the comment delimiters, the leading asterisks of doc-style
        comment lines, and collapses all whitespace runs to single spaces.
        """
        if not comment:
            return ""
        # Remove /* and */ (also the /** and **/ variants)
        text = re.sub(r'/\*+|\*+/', '', comment)
        # Remove the decorative '*' that starts each line of a doc comment
        text = re.sub(r'^[ \t]*\*+', '', text, flags=re.MULTILINE)
        # Clean whitespace
        return ' '.join(text.split())

    def _get_selector_context(self, content: str, pos: int) -> str:
        """
        Get the CSS selector context for a variable.

        Args:
            content: Raw CSS text
            pos: Offset of the declaration inside content

        Returns:
            Selector of the innermost rule opened before pos with whitespace
            collapsed, or "" when no opening brace precedes pos.
        """
        # Comments are not part of the selector and may contain braces
        before = self._COMMENT_PATTERN.sub(' ', content[:pos])

        # Find the opening brace before this position
        last_open = before.rfind('{')
        if last_open == -1:
            return ""

        # The selector starts after the previous rule end, at-rule opening
        # brace, or statement terminator (e.g. "@import ...;")
        selector_part = before[:last_open]
        boundary = max(selector_part.rfind(ch) for ch in '{};')
        selector_part = selector_part[boundary + 1:]

        # Collapse multi-line selectors onto one line
        return ' '.join(selector_part.split())

class CSSInlineExtractor:
    """
    Extract inline styles from HTML/JSX for token candidate identification.

    Finds style="" attributes and style={{ }} objects (including ones that
    span several lines) and extracts values that could become tokens.
    """

    # Patterns for extracting inline styles
    STYLE_ATTR_PATTERN = re.compile(
        r'style\s*=\s*["\']([^"\']+)["\']',
        re.IGNORECASE
    )

    # JSX style object pattern
    JSX_STYLE_PATTERN = re.compile(
        r'style\s*=\s*\{\{([^}]+)\}\}',
        re.MULTILINE
    )

    # Comma that separates JSX properties, i.e. not one inside "rgb(0, 0, 0)"
    _JSX_SEPARATOR = re.compile(r',(?![^()]*\))')

    # Value shapes worth turning into tokens (matched against lowercase text)
    _CANDIDATE_PATTERNS = (
        # Hex colors: #rgb, #rgba, #rrggbb, #rrggbbaa
        re.compile(r'^#(?:[0-9a-f]{3,4}|[0-9a-f]{6}|[0-9a-f]{8})$'),
        re.compile(r'^rgba?\s*\('),
        re.compile(r'^hsla?\s*\('),
        # Dimensions with common units, e.g. 12px, 1.5rem, .5em, 100%
        re.compile(r'^(?:\d+(?:\.\d+)?|\.\d+)(?:px|rem|em|%)$'),
    )

    async def extract_candidates(self, source: str) -> List[Tuple[str, str, int]]:
        """
        Extract inline style values as token candidates.

        Args:
            source: Path of an existing file, or the HTML/JSX content itself

        Returns:
            List of (property, value, line_number) tuples ordered by line;
            line_number is the 1-based line on which the style attribute
            starts. On the same line, HTML attributes precede JSX objects.
        """
        content = self._load_content(source)

        parsers = (
            (self.STYLE_ATTR_PATTERN, self._parse_style_string),
            (self.JSX_STYLE_PATTERN, self._parse_jsx_style),
        )

        found = []
        for order, (pattern, parse) in enumerate(parsers):
            # Matches arrive in increasing offset order, so newlines only
            # need to be counted between consecutive matches.
            line_num, scanned = 1, 0
            for match in pattern.finditer(content):
                line_num += content.count('\n', scanned, match.start())
                scanned = match.start()
                for prop, value in parse(match.group(1)):
                    if self._is_token_candidate(value):
                        found.append((line_num, order, prop, value))

        # Stable sort: keeps document order within each (line, pattern) group
        found.sort(key=lambda item: item[:2])
        return [(prop, value, line_num) for line_num, _, prop, value in found]

    def _load_content(self, source: str) -> str:
        """Return the file content when source is an existing file, else source itself."""
        # Markup passed as content is usually not a valid path (too long,
        # embedded NUL); treat any such failure as "not a file".
        try:
            is_file = Path(source).is_file()
        except (OSError, ValueError):
            is_file = False
        if is_file:
            return Path(source).read_text(encoding="utf-8")
        return source

    def _parse_style_string(self, style: str) -> List[Tuple[str, str]]:
        """Parse CSS style string into property-value pairs."""
        pairs = []
        for declaration in style.split(';'):
            if ':' in declaration:
                prop, value = declaration.split(':', 1)
                pairs.append((prop.strip(), value.strip()))
        return pairs

    def _parse_jsx_style(self, style: str) -> List[Tuple[str, str]]:
        """
        Parse JSX style object into property-value pairs.

        Property names are converted from camelCase to kebab-case and
        surrounding quotes are removed from names and values.
        """
        pairs = []
        # Simple parsing for common cases; commas inside parentheses belong
        # to the value (e.g. rgb(0, 0, 0)) and do not separate properties
        for part in self._JSX_SEPARATOR.split(style):
            if ':' in part:
                prop, value = part.split(':', 1)
                prop = prop.strip().strip('"\'')
                value = value.strip().strip('"\'')
                # Convert camelCase to kebab-case
                prop = re.sub(r'([a-z])([A-Z])', r'\1-\2', prop).lower()
                pairs.append((prop, value))
        return pairs

    def _is_token_candidate(self, value: str) -> bool:
        """
        Check if value should be extracted as a token.

        Only colors (hex, rgb[a], hsl[a]) and simple dimensions qualify;
        everything else, including var() references and keywords such as
        inherit/initial/unset/auto/none, is rejected.
        """
        normalized = value.strip().lower()
        return any(pattern.match(normalized) for pattern in self._CANDIDATE_PATTERNS)