"""Taddy Podcast API client for finding podcast appearances""" import asyncio import re from html import unescape from datetime import datetime, timedelta from typing import Dict, List, Optional from web.backend.core.http_client import http_client from modules.universal_logger import get_logger logger = get_logger('Taddy') def strip_html(text: str) -> str: """Strip HTML tags and decode entities from text""" if not text: return text # Remove HTML tags clean = re.sub(r'<[^>]+>', ' ', text) # Decode HTML entities clean = unescape(clean) # Normalize whitespace clean = re.sub(r'\s+', ' ', clean).strip() return clean class TaddyClient: """Client for interacting with the Taddy Podcast API (GraphQL) Supports primary and fallback accounts for quota management. When the primary account fails (500 error / quota exceeded), automatically switches to the fallback account. """ BASE_URL = "https://api.taddy.org" def __init__(self, user_id: str, api_key: str, user_id_2: str = None, api_key_2: str = None): # Primary account self.user_id = user_id self.api_key = api_key # Fallback account (optional) self.user_id_2 = user_id_2 self.api_key_2 = api_key_2 self.has_fallback = bool(user_id_2 and api_key_2) # Track which account is active self.using_fallback = False self._update_headers() def _update_headers(self): """Update headers based on current active account""" if self.using_fallback and self.has_fallback: self.headers = { "Content-Type": "application/json", "X-USER-ID": self.user_id_2, "X-API-KEY": self.api_key_2 } else: self.headers = { "Content-Type": "application/json", "X-USER-ID": self.user_id, "X-API-KEY": self.api_key } def _switch_to_fallback(self) -> bool: """Switch to fallback account if available. Returns True if switched.""" if self.has_fallback and not self.using_fallback: self.using_fallback = True self._update_headers() logger.info("Switched to fallback Taddy account") return True return False async def _graphql_query(self, query: str, variables: Dict = None, retry_on_fallback: bool = True) -> Optional[Dict]: """Execute a GraphQL query against the Taddy API If the primary account fails with a 500 error (quota exceeded), automatically retries with the fallback account if configured. """ try: payload = {"query": query} if variables: payload["variables"] = variables response = await http_client.post( self.BASE_URL, json=payload, headers=self.headers ) data = response.json() if "errors" in data: logger.error(f"Taddy API error: {data['errors']}") return None return data.get("data") except Exception as e: error_str = str(e).lower() # Check for 500 error (quota exceeded) - http_client raises ServiceError if "500" in error_str or "server error" in error_str: account_type = "fallback" if self.using_fallback else "primary" logger.warning(f"Taddy API returned 500 on {account_type} account (likely quota exceeded)") # Try fallback if available and we haven't already if retry_on_fallback and self._switch_to_fallback(): logger.info("Retrying with fallback Taddy account...") return await self._graphql_query(query, variables, retry_on_fallback=False) logger.error(f"Taddy API request failed: {e}") return None async def search_podcast_appearances( self, celebrity_name: str, lookback_days: int = 730, # 2 years lookahead_days: int = 30, limit: int = 25, max_pages: int = 10 ) -> List[Dict]: """ Search for podcast episodes featuring a celebrity. Args: celebrity_name: Name of the celebrity to search for lookback_days: How many days back to search lookahead_days: How many days forward to search (for scheduled releases) limit: Maximum results per page Returns: List of podcast appearance dicts """ appearances = [] # Calculate date range now = datetime.now() start_date = now - timedelta(days=lookback_days) # Convert to Unix timestamp (seconds) start_timestamp = int(start_date.timestamp()) query = """ query SearchPodcastEpisodes($term: String!, $limitPerPage: Int, $page: Int, $filterForPublishedAfter: Int) { search( term: $term, filterForTypes: PODCASTEPISODE, matchBy: EXACT_PHRASE, limitPerPage: $limitPerPage, page: $page, filterForPublishedAfter: $filterForPublishedAfter ) { searchId podcastEpisodes { uuid name description datePublished audioUrl persons { uuid name role } podcastSeries { uuid name imageUrl } websiteUrl } } } """ # Paginate through results (max 20 pages API limit, 25 per page = 500 max) # max_pages passed as parameter from config all_episodes = [] for page in range(1, max_pages + 1): variables = { "term": celebrity_name, "limitPerPage": limit, "page": page, "filterForPublishedAfter": start_timestamp } data = await self._graphql_query(query, variables) if not data or not data.get("search"): break episodes = data["search"].get("podcastEpisodes", []) if not episodes: break # No more results all_episodes.extend(episodes) # If we got fewer than limit, we've reached the end if len(episodes) < limit: break # Small delay between pages await asyncio.sleep(0.2) episodes = all_episodes for ep in episodes: try: # Parse the episode data podcast_series = ep.get("podcastSeries", {}) ep_name = (ep.get("name") or "") podcast_name = (podcast_series.get("name") or "") name_lower = celebrity_name.lower() name_parts = name_lower.split() # ===== USE PERSONS METADATA FOR ACCURATE FILTERING ===== # Check if celebrity is listed in the persons array with a role persons = ep.get("persons", []) or [] person_match = None credit_type = None for person in persons: person_name = (person.get("name") or "").lower() # Match full name or last name if name_lower in person_name or person_name in name_lower: person_match = person role = (person.get("role") or "").lower() # Map Taddy roles to our credit types if "host" in role: credit_type = "host" elif "guest" in role: credit_type = "guest" elif role: credit_type = role # Use whatever role they have else: credit_type = "guest" # Default to guest if role not specified break # Also check by last name for partial matches elif len(name_parts) >= 2: last_name = name_parts[-1] first_name = name_parts[0] if len(last_name) >= 4 and (last_name in person_name or first_name in person_name): person_match = person role = (person.get("role") or "").lower() if "host" in role: credit_type = "host" elif "guest" in role: credit_type = "guest" elif role: credit_type = role else: credit_type = "guest" break # If person is in the persons list, include the episode if person_match: logger.debug(f"Accepting '{ep_name}' - {celebrity_name} listed as {credit_type} in persons metadata") is_host = (credit_type == "host") else: # Fallback: check if they're the host via podcast series name podcast_name_lower = podcast_name.lower() is_host = name_lower in podcast_name_lower if not is_host and len(name_parts) >= 2: last_name = name_parts[-1] first_name = name_parts[0] if len(last_name) >= 4: is_host = (f"with {last_name}" in podcast_name_lower or f"with {first_name}" in podcast_name_lower or f"{first_name} {last_name}" in podcast_name_lower) if is_host: credit_type = "host" logger.debug(f"Accepting '{ep_name}' - host podcast (name in series title)") else: # No persons metadata - use WHITELIST approach # Only accept if title clearly indicates an interview/guest appearance ep_name_lower = ep_name.lower() if name_lower not in ep_name_lower: logger.debug(f"Skipping '{ep_name}' - name not in title") continue # Check podcast name for news/gossip shows first garbage_podcast_names = ['news', 'gossip', 'rumor', 'daily', 'trending', 'tmz', 'variety', 'march madness', 'cruz show', 'aesthetic arrest', 'devious maids'] if any(word in podcast_name_lower for word in garbage_podcast_names): logger.debug(f"Skipping '{ep_name}' - podcast name suggests news/gossip") continue # Reject listicles (multiple comma-separated topics) comma_count = ep_name_lower.count(',') if comma_count >= 3: logger.debug(f"Skipping '{ep_name}' - listicle format ({comma_count} commas)") continue # WHITELIST: Only accept if title matches clear interview patterns interview_patterns = [ # Direct interview indicators rf'(interview|interviews|interviewing)\s+(with\s+)?{re.escape(name_lower)}', rf'{re.escape(name_lower)}\s+(interview|interviewed)', # Guest indicators rf'(guest|featuring|feat\.?|ft\.?|with guest|special guest)[:\s]+{re.escape(name_lower)}', rf'{re.escape(name_lower)}\s+(joins|joined|stops by|sits down|talks|speaks|discusses|shares|reveals|opens up|gets real|gets honest)', # "Name on Topic" format (common interview title) rf'^{re.escape(name_lower)}\s+on\s+', # Episode number + name format ("Ep 123: Name...") rf'^(ep\.?|episode|#)\s*\d+[:\s]+{re.escape(name_lower)}', # Name at start followed by colon or dash (interview format) rf'^{re.escape(name_lower)}\s*[:\-–—]\s*', # "Conversation with Name" rf'(conversation|chat|talk|talking|speaking)\s+with\s+{re.escape(name_lower)}', # "Name Returns" / "Name is Back" rf'{re.escape(name_lower)}\s+(returns|is back|comes back)', # Q&A format rf'(q&a|q\s*&\s*a|ama)\s+(with\s+)?{re.escape(name_lower)}', # Podcast-specific patterns rf'{re.escape(name_lower)}\s+(live|in studio|in the studio|on the show|on the pod)', ] is_interview = False for pattern in interview_patterns: if re.search(pattern, ep_name_lower): is_interview = True logger.debug(f"Accepting '{ep_name}' - matches interview pattern") break if not is_interview: logger.debug(f"Skipping '{ep_name}' - no interview pattern match (name just mentioned)") continue credit_type = "guest" # Get the artwork URL from podcast series artwork_url = podcast_series.get("imageUrl") # Parse date date_published = ep.get("datePublished") if date_published: # Taddy returns Unix timestamp in seconds try: pub_date = datetime.fromtimestamp(date_published) appearance_date = pub_date.strftime("%Y-%m-%d") status = "upcoming" if pub_date.date() > now.date() else "aired" except (ValueError, TypeError): appearance_date = None status = "aired" else: appearance_date = None status = "aired" # Get episode URL episode_url = ep.get("websiteUrl") appearance = { "appearance_type": "Podcast", "show_name": podcast_series.get("name", "Unknown Podcast"), "episode_title": ep.get("name"), "appearance_date": appearance_date, "status": status, "description": strip_html(ep.get("description")), "poster_url": artwork_url, "audio_url": ep.get("audioUrl"), "url": episode_url, "credit_type": credit_type or ("host" if is_host else "guest"), "character_name": "Self", "taddy_episode_uuid": ep.get("uuid"), "taddy_podcast_uuid": podcast_series.get("uuid"), "duration_seconds": None, # Duration removed from query to reduce complexity } appearances.append(appearance) logger.info(f"Found podcast appearance: {celebrity_name} on '{podcast_series.get('name')}' - {ep.get('name')}") except Exception as e: logger.error(f"Error parsing Taddy episode: {e}") continue return appearances async def test_connection(self) -> bool: """Test if the API credentials are valid""" query = """ query TestConnection { search(term: "test", filterForTypes: PODCASTSERIES, limitPerPage: 1) { searchId } } """ data = await self._graphql_query(query) return data is not None