Initial commit

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Todd
2026-03-29 22:42:55 -04:00
commit 0d7b2b1aab
389 changed files with 280296 additions and 0 deletions

391
modules/taddy_client.py Normal file
View File

@@ -0,0 +1,391 @@
"""Taddy Podcast API client for finding podcast appearances"""
import asyncio
import re
from html import unescape
from datetime import datetime, timedelta
from typing import Dict, List, Optional
from web.backend.core.http_client import http_client
from modules.universal_logger import get_logger
logger = get_logger('Taddy')
def strip_html(text: str) -> str:
    """Strip HTML tags and decode entities from text"""
    if not text:
        return text
    # Replace each tag with a space so adjacent words don't run together.
    without_tags = re.sub(r'<[^>]+>', ' ', text)
    # Decode entities (&amp;, &#39;, ...) after tag removal.
    decoded = unescape(without_tags)
    # Collapse runs of whitespace and trim the ends.
    return re.sub(r'\s+', ' ', decoded).strip()
class TaddyClient:
    """Client for interacting with the Taddy Podcast API (GraphQL)

    Supports primary and fallback accounts for quota management.
    When the primary account fails (500 error / quota exceeded),
    automatically switches to the fallback account.
    """

    # Single GraphQL endpoint; every query is POSTed here.
    BASE_URL = "https://api.taddy.org"

    def __init__(self, user_id: str, api_key: str,
                 user_id_2: Optional[str] = None, api_key_2: Optional[str] = None):
        """Initialize with primary and optional fallback API credentials.

        Args:
            user_id: Primary account value for the X-USER-ID header.
            api_key: Primary account value for the X-API-KEY header.
            user_id_2: Optional fallback account user id.
            api_key_2: Optional fallback account API key.
        """
        # Primary account
        self.user_id = user_id
        self.api_key = api_key
        # Fallback account (optional) - both pieces must be present to count
        self.user_id_2 = user_id_2
        self.api_key_2 = api_key_2
        self.has_fallback = bool(user_id_2 and api_key_2)
        # Track which account is active
        self.using_fallback = False
        self._update_headers()

    def _update_headers(self):
        """Update request headers based on the currently active account."""
        if self.using_fallback and self.has_fallback:
            self.headers = {
                "Content-Type": "application/json",
                "X-USER-ID": self.user_id_2,
                "X-API-KEY": self.api_key_2
            }
        else:
            self.headers = {
                "Content-Type": "application/json",
                "X-USER-ID": self.user_id,
                "X-API-KEY": self.api_key
            }

    def _switch_to_fallback(self) -> bool:
        """Switch to fallback account if available. Returns True if switched."""
        if self.has_fallback and not self.using_fallback:
            self.using_fallback = True
            self._update_headers()
            logger.info("Switched to fallback Taddy account")
            return True
        return False

    @staticmethod
    def _map_role(role: Optional[str]) -> str:
        """Map a Taddy person role string to our credit type.

        A role containing "host" or "guest" wins (checked in that order);
        any other non-empty role is passed through lowercased as-is; a
        missing/empty role defaults to "guest".
        """
        role = (role or "").lower()
        if "host" in role:
            return "host"
        if "guest" in role:
            return "guest"
        # Use whatever role they have; default to guest if role not specified
        return role or "guest"

    async def _graphql_query(self, query: str, variables: Optional[Dict] = None,
                             retry_on_fallback: bool = True) -> Optional[Dict]:
        """Execute a GraphQL query against the Taddy API

        If the primary account fails with a 500 error (quota exceeded),
        automatically retries with the fallback account if configured.

        Returns:
            The response's "data" payload, or None on any error.
        """
        try:
            payload = {"query": query}
            if variables:
                payload["variables"] = variables
            response = await http_client.post(
                self.BASE_URL,
                json=payload,
                headers=self.headers
            )
            data = response.json()
            # GraphQL reports errors in-band; treat any as a failed query.
            if "errors" in data:
                logger.error(f"Taddy API error: {data['errors']}")
                return None
            return data.get("data")
        except Exception as e:
            error_str = str(e).lower()
            # Check for 500 error (quota exceeded) - http_client raises ServiceError
            if "500" in error_str or "server error" in error_str:
                account_type = "fallback" if self.using_fallback else "primary"
                logger.warning(f"Taddy API returned 500 on {account_type} account (likely quota exceeded)")
                # Try fallback if available and we haven't already
                if retry_on_fallback and self._switch_to_fallback():
                    logger.info("Retrying with fallback Taddy account...")
                    return await self._graphql_query(query, variables, retry_on_fallback=False)
            logger.error(f"Taddy API request failed: {e}")
            return None

    async def search_podcast_appearances(
        self,
        celebrity_name: str,
        lookback_days: int = 730,  # 2 years
        lookahead_days: int = 30,
        limit: int = 25,
        max_pages: int = 10
    ) -> List[Dict]:
        """
        Search for podcast episodes featuring a celebrity.

        Args:
            celebrity_name: Name of the celebrity to search for
            lookback_days: How many days back to search
            lookahead_days: How many days forward to search (for scheduled
                releases). NOTE(review): currently unused -- the query only
                filters on publish-after, so future-dated episodes are
                returned regardless and marked "upcoming" below.
            limit: Maximum results per page
            max_pages: Maximum pages to fetch (Taddy caps search at 20 pages)

        Returns:
            List of podcast appearance dicts
        """
        appearances = []
        # Calculate date range
        now = datetime.now()
        start_date = now - timedelta(days=lookback_days)
        # Convert to Unix timestamp (seconds)
        start_timestamp = int(start_date.timestamp())
        query = """
        query SearchPodcastEpisodes($term: String!, $limitPerPage: Int, $page: Int, $filterForPublishedAfter: Int) {
            search(
                term: $term,
                filterForTypes: PODCASTEPISODE,
                matchBy: EXACT_PHRASE,
                limitPerPage: $limitPerPage,
                page: $page,
                filterForPublishedAfter: $filterForPublishedAfter
            ) {
                searchId
                podcastEpisodes {
                    uuid
                    name
                    description
                    datePublished
                    audioUrl
                    persons {
                        uuid
                        name
                        role
                    }
                    podcastSeries {
                        uuid
                        name
                        imageUrl
                    }
                    websiteUrl
                }
            }
        }
        """
        # Paginate through results (max 20 pages API limit, 25 per page = 500 max)
        # max_pages passed as parameter from config
        all_episodes = []
        for page in range(1, max_pages + 1):
            variables = {
                "term": celebrity_name,
                "limitPerPage": limit,
                "page": page,
                "filterForPublishedAfter": start_timestamp
            }
            data = await self._graphql_query(query, variables)
            if not data or not data.get("search"):
                break
            episodes = data["search"].get("podcastEpisodes", [])
            if not episodes:
                break  # No more results
            all_episodes.extend(episodes)
            # If we got fewer than limit, we've reached the end
            if len(episodes) < limit:
                break
            # Small delay between pages
            await asyncio.sleep(0.2)
        episodes = all_episodes
        for ep in episodes:
            try:
                # Parse the episode data
                podcast_series = ep.get("podcastSeries", {})
                ep_name = (ep.get("name") or "")
                podcast_name = (podcast_series.get("name") or "")
                name_lower = celebrity_name.lower()
                name_parts = name_lower.split()
                # ===== USE PERSONS METADATA FOR ACCURATE FILTERING =====
                # Check if celebrity is listed in the persons array with a role
                persons = ep.get("persons", []) or []
                person_match = None
                credit_type = None
                for person in persons:
                    person_name = (person.get("name") or "").lower()
                    # BUGFIX: skip unnamed persons. "" is a substring of every
                    # string, so an empty name would match ANY celebrity via
                    # the `person_name in name_lower` check below.
                    if not person_name:
                        continue
                    # Match full name or last name
                    if name_lower in person_name or person_name in name_lower:
                        person_match = person
                        credit_type = self._map_role(person.get("role"))
                        break
                    # Also check by last name for partial matches
                    elif len(name_parts) >= 2:
                        last_name = name_parts[-1]
                        first_name = name_parts[0]
                        # Require >= 4 chars to avoid short-name false positives
                        if len(last_name) >= 4 and (last_name in person_name or first_name in person_name):
                            person_match = person
                            credit_type = self._map_role(person.get("role"))
                            break
                # If person is in the persons list, include the episode
                if person_match:
                    logger.debug(f"Accepting '{ep_name}' - {celebrity_name} listed as {credit_type} in persons metadata")
                    is_host = (credit_type == "host")
                else:
                    # Fallback: check if they're the host via podcast series name
                    podcast_name_lower = podcast_name.lower()
                    is_host = name_lower in podcast_name_lower
                    if not is_host and len(name_parts) >= 2:
                        last_name = name_parts[-1]
                        first_name = name_parts[0]
                        if len(last_name) >= 4:
                            is_host = (f"with {last_name}" in podcast_name_lower or
                                       f"with {first_name}" in podcast_name_lower or
                                       f"{first_name} {last_name}" in podcast_name_lower)
                    if is_host:
                        credit_type = "host"
                        logger.debug(f"Accepting '{ep_name}' - host podcast (name in series title)")
                    else:
                        # No persons metadata - use WHITELIST approach
                        # Only accept if title clearly indicates an interview/guest appearance
                        ep_name_lower = ep_name.lower()
                        if name_lower not in ep_name_lower:
                            logger.debug(f"Skipping '{ep_name}' - name not in title")
                            continue
                        # Check podcast name for news/gossip shows first
                        garbage_podcast_names = ['news', 'gossip', 'rumor', 'daily', 'trending', 'tmz', 'variety', 'march madness', 'cruz show', 'aesthetic arrest', 'devious maids']
                        if any(word in podcast_name_lower for word in garbage_podcast_names):
                            logger.debug(f"Skipping '{ep_name}' - podcast name suggests news/gossip")
                            continue
                        # Reject listicles (multiple comma-separated topics)
                        comma_count = ep_name_lower.count(',')
                        if comma_count >= 3:
                            logger.debug(f"Skipping '{ep_name}' - listicle format ({comma_count} commas)")
                            continue
                        # WHITELIST: Only accept if title matches clear interview patterns
                        interview_patterns = [
                            # Direct interview indicators
                            rf'(interview|interviews|interviewing)\s+(with\s+)?{re.escape(name_lower)}',
                            rf'{re.escape(name_lower)}\s+(interview|interviewed)',
                            # Guest indicators
                            rf'(guest|featuring|feat\.?|ft\.?|with guest|special guest)[:\s]+{re.escape(name_lower)}',
                            rf'{re.escape(name_lower)}\s+(joins|joined|stops by|sits down|talks|speaks|discusses|shares|reveals|opens up|gets real|gets honest)',
                            # "Name on Topic" format (common interview title)
                            rf'^{re.escape(name_lower)}\s+on\s+',
                            # Episode number + name format ("Ep 123: Name...")
                            rf'^(ep\.?|episode|#)\s*\d+[:\s]+{re.escape(name_lower)}',
                            # Name at start followed by colon or dash (interview format)
                            rf'^{re.escape(name_lower)}\s*[:\-–—]\s*',
                            # "Conversation with Name"
                            rf'(conversation|chat|talk|talking|speaking)\s+with\s+{re.escape(name_lower)}',
                            # "Name Returns" / "Name is Back"
                            rf'{re.escape(name_lower)}\s+(returns|is back|comes back)',
                            # Q&A format
                            rf'(q&a|q\s*&\s*a|ama)\s+(with\s+)?{re.escape(name_lower)}',
                            # Podcast-specific patterns
                            rf'{re.escape(name_lower)}\s+(live|in studio|in the studio|on the show|on the pod)',
                        ]
                        is_interview = False
                        for pattern in interview_patterns:
                            if re.search(pattern, ep_name_lower):
                                is_interview = True
                                logger.debug(f"Accepting '{ep_name}' - matches interview pattern")
                                break
                        if not is_interview:
                            logger.debug(f"Skipping '{ep_name}' - no interview pattern match (name just mentioned)")
                            continue
                        credit_type = "guest"
                # Get the artwork URL from podcast series
                artwork_url = podcast_series.get("imageUrl")
                # Parse date
                date_published = ep.get("datePublished")
                if date_published:
                    # Taddy returns Unix timestamp in seconds
                    try:
                        pub_date = datetime.fromtimestamp(date_published)
                        appearance_date = pub_date.strftime("%Y-%m-%d")
                        # Future publish date => episode is scheduled, not aired
                        status = "upcoming" if pub_date.date() > now.date() else "aired"
                    except (ValueError, TypeError):
                        appearance_date = None
                        status = "aired"
                else:
                    appearance_date = None
                    status = "aired"
                # Get episode URL
                episode_url = ep.get("websiteUrl")
                appearance = {
                    "appearance_type": "Podcast",
                    "show_name": podcast_series.get("name", "Unknown Podcast"),
                    "episode_title": ep.get("name"),
                    "appearance_date": appearance_date,
                    "status": status,
                    "description": strip_html(ep.get("description")),
                    "poster_url": artwork_url,
                    "audio_url": ep.get("audioUrl"),
                    "url": episode_url,
                    "credit_type": credit_type or ("host" if is_host else "guest"),
                    "character_name": "Self",
                    "taddy_episode_uuid": ep.get("uuid"),
                    "taddy_podcast_uuid": podcast_series.get("uuid"),
                    "duration_seconds": None,  # Duration removed from query to reduce complexity
                }
                appearances.append(appearance)
                logger.info(f"Found podcast appearance: {celebrity_name} on '{podcast_series.get('name')}' - {ep.get('name')}")
            except Exception as e:
                # Best-effort per-episode parsing: one bad episode shouldn't
                # abort the whole search.
                logger.error(f"Error parsing Taddy episode: {e}")
                continue
        return appearances

    async def test_connection(self) -> bool:
        """Test if the API credentials are valid"""
        query = """
        query TestConnection {
            search(term: "test", filterForTypes: PODCASTSERIES, limitPerPage: 1) {
                searchId
            }
        }
        """
        data = await self._graphql_query(query)
        return data is not None