Files
media-downloader/modules/taddy_client.py
Todd 0d7b2b1aab Initial commit
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-29 22:42:55 -04:00

392 lines
16 KiB
Python

"""Taddy Podcast API client for finding podcast appearances"""
import asyncio
import re
from html import unescape
from datetime import datetime, timedelta
from typing import Dict, List, Optional
from web.backend.core.http_client import http_client
from modules.universal_logger import get_logger
logger = get_logger('Taddy')
def strip_html(text: str) -> str:
    """Return *text* with HTML tags removed, entities decoded, and whitespace collapsed.

    Falsy inputs (empty string, None) are returned unchanged.
    """
    if not text:
        return text
    # Replace tags with a space so adjacent words don't fuse together.
    without_tags = re.sub(r'<[^>]+>', ' ', text)
    # Turn entities like &amp; back into their literal characters.
    decoded = unescape(without_tags)
    # Collapse runs of whitespace and trim the ends.
    return re.sub(r'\s+', ' ', decoded).strip()
class TaddyClient:
    """Client for interacting with the Taddy Podcast API (GraphQL)

    Supports primary and fallback accounts for quota management.
    When the primary account fails (500 error / quota exceeded),
    automatically switches to the fallback account.
    """

    # Single GraphQL endpoint; every query is POSTed here.
    BASE_URL = "https://api.taddy.org"

    def __init__(self, user_id: str, api_key: str,
                 user_id_2: Optional[str] = None, api_key_2: Optional[str] = None):
        """Store credentials and build headers for the primary account.

        Args:
            user_id: Taddy X-USER-ID for the primary account.
            api_key: Taddy X-API-KEY for the primary account.
            user_id_2: Optional X-USER-ID for the fallback account.
            api_key_2: Optional X-API-KEY for the fallback account.
        """
        # Primary account
        self.user_id = user_id
        self.api_key = api_key
        # Fallback account (optional) — both credentials must be present
        # for the fallback to be considered available.
        self.user_id_2 = user_id_2
        self.api_key_2 = api_key_2
        self.has_fallback = bool(user_id_2 and api_key_2)
        # Track which account is active
        self.using_fallback = False
        self._update_headers()
def _update_headers(self):
"""Update headers based on current active account"""
if self.using_fallback and self.has_fallback:
self.headers = {
"Content-Type": "application/json",
"X-USER-ID": self.user_id_2,
"X-API-KEY": self.api_key_2
}
else:
self.headers = {
"Content-Type": "application/json",
"X-USER-ID": self.user_id,
"X-API-KEY": self.api_key
}
def _switch_to_fallback(self) -> bool:
"""Switch to fallback account if available. Returns True if switched."""
if self.has_fallback and not self.using_fallback:
self.using_fallback = True
self._update_headers()
logger.info("Switched to fallback Taddy account")
return True
return False
    async def _graphql_query(self, query: str, variables: Dict = None, retry_on_fallback: bool = True) -> Optional[Dict]:
        """Execute a GraphQL query against the Taddy API

        If the primary account fails with a 500 error (quota exceeded),
        automatically retries with the fallback account if configured.

        Args:
            query: GraphQL query document.
            variables: Optional GraphQL variables dict.
            retry_on_fallback: Internal guard so the fallback retry is
                attempted at most once per logical request.

        Returns:
            The response's ``data`` payload on success, or None on any error
            (transport failure, or in-band GraphQL ``errors``).
        """
        try:
            payload = {"query": query}
            if variables:
                payload["variables"] = variables
            response = await http_client.post(
                self.BASE_URL,
                json=payload,
                headers=self.headers
            )
            data = response.json()
            # GraphQL reports application-level failures in-band under "errors"
            # even with an HTTP 200, so check before trusting "data".
            if "errors" in data:
                logger.error(f"Taddy API error: {data['errors']}")
                return None
            return data.get("data")
        except Exception as e:
            error_str = str(e).lower()
            # Check for 500 error (quota exceeded) - http_client raises ServiceError
            # NOTE(review): substring matching on the exception text is fragile;
            # presumably http_client exposes no status-code attribute — confirm.
            if "500" in error_str or "server error" in error_str:
                account_type = "fallback" if self.using_fallback else "primary"
                logger.warning(f"Taddy API returned 500 on {account_type} account (likely quota exceeded)")
                # Try fallback if available and we haven't already
                if retry_on_fallback and self._switch_to_fallback():
                    logger.info("Retrying with fallback Taddy account...")
                    # retry_on_fallback=False prevents a second switch/recursion.
                    return await self._graphql_query(query, variables, retry_on_fallback=False)
            logger.error(f"Taddy API request failed: {e}")
            return None
    async def search_podcast_appearances(
        self,
        celebrity_name: str,
        lookback_days: int = 730,  # 2 years
        lookahead_days: int = 30,
        limit: int = 25,
        max_pages: int = 10
    ) -> List[Dict]:
        """
        Search for podcast episodes featuring a celebrity.

        Args:
            celebrity_name: Name of the celebrity to search for
            lookback_days: How many days back to search
            lookahead_days: How many days forward to search (for scheduled releases)
            limit: Maximum results per page
            max_pages: Maximum number of result pages to fetch

        Returns:
            List of podcast appearance dicts
        """
        # NOTE(review): lookahead_days is accepted but never applied below —
        # only filterForPublishedAfter is sent to the API. Confirm intent.
        appearances = []
        # Calculate date range
        # NOTE(review): datetime.now() is naive local time; "upcoming" vs
        # "aired" below is judged against the local clock — confirm this is ok.
        now = datetime.now()
        start_date = now - timedelta(days=lookback_days)
        # Convert to Unix timestamp (seconds)
        start_timestamp = int(start_date.timestamp())
        # GraphQL search for episodes whose text exactly matches the name,
        # published after the cutoff. (GraphQL ignores this indentation.)
        query = """
        query SearchPodcastEpisodes($term: String!, $limitPerPage: Int, $page: Int, $filterForPublishedAfter: Int) {
          search(
            term: $term,
            filterForTypes: PODCASTEPISODE,
            matchBy: EXACT_PHRASE,
            limitPerPage: $limitPerPage,
            page: $page,
            filterForPublishedAfter: $filterForPublishedAfter
          ) {
            searchId
            podcastEpisodes {
              uuid
              name
              description
              datePublished
              audioUrl
              persons {
                uuid
                name
                role
              }
              podcastSeries {
                uuid
                name
                imageUrl
              }
              websiteUrl
            }
          }
        }
        """
        # Paginate through results (max 20 pages API limit, 25 per page = 500 max)
        # max_pages passed as parameter from config
        all_episodes = []
        for page in range(1, max_pages + 1):
            variables = {
                "term": celebrity_name,
                "limitPerPage": limit,
                "page": page,
                "filterForPublishedAfter": start_timestamp
            }
            data = await self._graphql_query(query, variables)
            if not data or not data.get("search"):
                break
            episodes = data["search"].get("podcastEpisodes", [])
            if not episodes:
                break  # No more results
            all_episodes.extend(episodes)
            # If we got fewer than limit, we've reached the end
            if len(episodes) < limit:
                break
            # Small delay between pages
            await asyncio.sleep(0.2)
        episodes = all_episodes
        for ep in episodes:
            try:
                # Parse the episode data
                podcast_series = ep.get("podcastSeries", {})
                ep_name = (ep.get("name") or "")
                podcast_name = (podcast_series.get("name") or "")
                name_lower = celebrity_name.lower()
                name_parts = name_lower.split()
                # ===== USE PERSONS METADATA FOR ACCURATE FILTERING =====
                # Check if celebrity is listed in the persons array with a role
                persons = ep.get("persons", []) or []
                person_match = None
                credit_type = None
                for person in persons:
                    person_name = (person.get("name") or "").lower()
                    # Match full name or last name
                    # (substring match in either direction catches middle names
                    # and shortened forms)
                    if name_lower in person_name or person_name in name_lower:
                        person_match = person
                        role = (person.get("role") or "").lower()
                        # Map Taddy roles to our credit types
                        if "host" in role:
                            credit_type = "host"
                        elif "guest" in role:
                            credit_type = "guest"
                        elif role:
                            credit_type = role  # Use whatever role they have
                        else:
                            credit_type = "guest"  # Default to guest if role not specified
                        break
                    # Also check by last name for partial matches
                    elif len(name_parts) >= 2:
                        last_name = name_parts[-1]
                        first_name = name_parts[0]
                        # Last name must be >= 4 chars to avoid false positives
                        # on short common surnames.
                        if len(last_name) >= 4 and (last_name in person_name or first_name in person_name):
                            person_match = person
                            role = (person.get("role") or "").lower()
                            if "host" in role:
                                credit_type = "host"
                            elif "guest" in role:
                                credit_type = "guest"
                            elif role:
                                credit_type = role
                            else:
                                credit_type = "guest"
                            break
                # If person is in the persons list, include the episode
                if person_match:
                    logger.debug(f"Accepting '{ep_name}' - {celebrity_name} listed as {credit_type} in persons metadata")
                    is_host = (credit_type == "host")
                else:
                    # Fallback: check if they're the host via podcast series name
                    podcast_name_lower = podcast_name.lower()
                    is_host = name_lower in podcast_name_lower
                    if not is_host and len(name_parts) >= 2:
                        last_name = name_parts[-1]
                        first_name = name_parts[0]
                        if len(last_name) >= 4:
                            # e.g. "The Show with Smith" / "John Smith Show"
                            is_host = (f"with {last_name}" in podcast_name_lower or
                                       f"with {first_name}" in podcast_name_lower or
                                       f"{first_name} {last_name}" in podcast_name_lower)
                    if is_host:
                        credit_type = "host"
                        logger.debug(f"Accepting '{ep_name}' - host podcast (name in series title)")
                    else:
                        # No persons metadata - use WHITELIST approach
                        # Only accept if title clearly indicates an interview/guest appearance
                        ep_name_lower = ep_name.lower()
                        if name_lower not in ep_name_lower:
                            logger.debug(f"Skipping '{ep_name}' - name not in title")
                            continue
                        # Check podcast name for news/gossip shows first
                        garbage_podcast_names = ['news', 'gossip', 'rumor', 'daily', 'trending', 'tmz', 'variety', 'march madness', 'cruz show', 'aesthetic arrest', 'devious maids']
                        if any(word in podcast_name_lower for word in garbage_podcast_names):
                            logger.debug(f"Skipping '{ep_name}' - podcast name suggests news/gossip")
                            continue
                        # Reject listicles (multiple comma-separated topics)
                        comma_count = ep_name_lower.count(',')
                        if comma_count >= 3:
                            logger.debug(f"Skipping '{ep_name}' - listicle format ({comma_count} commas)")
                            continue
                        # WHITELIST: Only accept if title matches clear interview patterns
                        interview_patterns = [
                            # Direct interview indicators
                            rf'(interview|interviews|interviewing)\s+(with\s+)?{re.escape(name_lower)}',
                            rf'{re.escape(name_lower)}\s+(interview|interviewed)',
                            # Guest indicators
                            rf'(guest|featuring|feat\.?|ft\.?|with guest|special guest)[:\s]+{re.escape(name_lower)}',
                            rf'{re.escape(name_lower)}\s+(joins|joined|stops by|sits down|talks|speaks|discusses|shares|reveals|opens up|gets real|gets honest)',
                            # "Name on Topic" format (common interview title)
                            rf'^{re.escape(name_lower)}\s+on\s+',
                            # Episode number + name format ("Ep 123: Name...")
                            rf'^(ep\.?|episode|#)\s*\d+[:\s]+{re.escape(name_lower)}',
                            # Name at start followed by colon or dash (interview format)
                            rf'^{re.escape(name_lower)}\s*[:\-–—]\s*',
                            # "Conversation with Name"
                            rf'(conversation|chat|talk|talking|speaking)\s+with\s+{re.escape(name_lower)}',
                            # "Name Returns" / "Name is Back"
                            rf'{re.escape(name_lower)}\s+(returns|is back|comes back)',
                            # Q&A format
                            rf'(q&a|q\s*&\s*a|ama)\s+(with\s+)?{re.escape(name_lower)}',
                            # Podcast-specific patterns
                            rf'{re.escape(name_lower)}\s+(live|in studio|in the studio|on the show|on the pod)',
                        ]
                        is_interview = False
                        for pattern in interview_patterns:
                            if re.search(pattern, ep_name_lower):
                                is_interview = True
                                logger.debug(f"Accepting '{ep_name}' - matches interview pattern")
                                break
                        if not is_interview:
                            logger.debug(f"Skipping '{ep_name}' - no interview pattern match (name just mentioned)")
                            continue
                        credit_type = "guest"
                # Get the artwork URL from podcast series
                artwork_url = podcast_series.get("imageUrl")
                # Parse date
                date_published = ep.get("datePublished")
                if date_published:
                    # Taddy returns Unix timestamp in seconds
                    try:
                        pub_date = datetime.fromtimestamp(date_published)
                        appearance_date = pub_date.strftime("%Y-%m-%d")
                        # Future publish dates are scheduled/upcoming episodes.
                        status = "upcoming" if pub_date.date() > now.date() else "aired"
                    except (ValueError, TypeError):
                        appearance_date = None
                        status = "aired"
                else:
                    appearance_date = None
                    status = "aired"
                # Get episode URL
                episode_url = ep.get("websiteUrl")
                appearance = {
                    "appearance_type": "Podcast",
                    "show_name": podcast_series.get("name", "Unknown Podcast"),
                    "episode_title": ep.get("name"),
                    "appearance_date": appearance_date,
                    "status": status,
                    "description": strip_html(ep.get("description")),
                    "poster_url": artwork_url,
                    "audio_url": ep.get("audioUrl"),
                    "url": episode_url,
                    "credit_type": credit_type or ("host" if is_host else "guest"),
                    "character_name": "Self",
                    "taddy_episode_uuid": ep.get("uuid"),
                    "taddy_podcast_uuid": podcast_series.get("uuid"),
                    "duration_seconds": None,  # Duration removed from query to reduce complexity
                }
                appearances.append(appearance)
                logger.info(f"Found podcast appearance: {celebrity_name} on '{podcast_series.get('name')}' - {ep.get('name')}")
            except Exception as e:
                # Best-effort parsing: one malformed episode must not abort the batch.
                logger.error(f"Error parsing Taddy episode: {e}")
                continue
        return appearances
async def test_connection(self) -> bool:
"""Test if the API credentials are valid"""
query = """
query TestConnection {
search(term: "test", filterForTypes: PODCASTSERIES, limitPerPage: 1) {
searchId
}
}
"""
data = await self._graphql_query(query)
return data is not None