Initial commit

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Todd
2026-03-29 22:42:55 -04:00
commit 0d7b2b1aab
389 changed files with 280296 additions and 0 deletions

391
modules/taddy_client.py Normal file
View File

@@ -0,0 +1,391 @@
"""Taddy Podcast API client for finding podcast appearances"""
import asyncio
import re
from html import unescape
from datetime import datetime, timedelta
from typing import Dict, List, Optional
from web.backend.core.http_client import http_client
from modules.universal_logger import get_logger
logger = get_logger('Taddy')
def strip_html(text: str) -> str:
    """Strip HTML tags and decode entities from text"""
    if not text:
        return text
    # Replace each tag with a space so adjacent words don't run together.
    without_tags = re.sub(r'<[^>]+>', ' ', text)
    # Decode entities (&amp;, &#39;, ...) after tag removal.
    decoded = unescape(without_tags)
    # Collapse runs of whitespace and trim the ends.
    return re.sub(r'\s+', ' ', decoded).strip()
class TaddyClient:
    """Client for interacting with the Taddy Podcast API (GraphQL)

    Supports primary and fallback accounts for quota management.
    When the primary account fails (500 error / quota exceeded),
    automatically switches to the fallback account.
    """

    # Single GraphQL endpoint; every query is POSTed here.
    BASE_URL = "https://api.taddy.org"

    def __init__(self, user_id: str, api_key: str,
                 user_id_2: Optional[str] = None, api_key_2: Optional[str] = None):
        """Initialize with primary and optional fallback API credentials.

        Args:
            user_id: Primary account value for the X-USER-ID header.
            api_key: Primary account value for the X-API-KEY header.
            user_id_2: Optional fallback account user id.
            api_key_2: Optional fallback account API key.
        """
        # Primary account
        self.user_id = user_id
        self.api_key = api_key
        # Fallback account (optional) - both pieces must be present to count
        self.user_id_2 = user_id_2
        self.api_key_2 = api_key_2
        self.has_fallback = bool(user_id_2 and api_key_2)
        # Track which account is active
        self.using_fallback = False
        self._update_headers()

    def _update_headers(self):
        """Update request headers based on the currently active account."""
        if self.using_fallback and self.has_fallback:
            self.headers = {
                "Content-Type": "application/json",
                "X-USER-ID": self.user_id_2,
                "X-API-KEY": self.api_key_2
            }
        else:
            self.headers = {
                "Content-Type": "application/json",
                "X-USER-ID": self.user_id,
                "X-API-KEY": self.api_key
            }

    def _switch_to_fallback(self) -> bool:
        """Switch to fallback account if available. Returns True if switched."""
        if self.has_fallback and not self.using_fallback:
            self.using_fallback = True
            self._update_headers()
            logger.info("Switched to fallback Taddy account")
            return True
        return False

    @staticmethod
    def _map_role(role: Optional[str]) -> str:
        """Map a Taddy person role string to our credit type.

        A role containing "host" or "guest" wins (checked in that order);
        any other non-empty role is passed through lowercased as-is; a
        missing/empty role defaults to "guest".
        """
        role = (role or "").lower()
        if "host" in role:
            return "host"
        if "guest" in role:
            return "guest"
        # Use whatever role they have; default to guest if role not specified
        return role or "guest"

    async def _graphql_query(self, query: str, variables: Optional[Dict] = None,
                             retry_on_fallback: bool = True) -> Optional[Dict]:
        """Execute a GraphQL query against the Taddy API

        If the primary account fails with a 500 error (quota exceeded),
        automatically retries with the fallback account if configured.

        Returns:
            The response's "data" payload, or None on any error.
        """
        try:
            payload = {"query": query}
            if variables:
                payload["variables"] = variables
            response = await http_client.post(
                self.BASE_URL,
                json=payload,
                headers=self.headers
            )
            data = response.json()
            # GraphQL reports errors in-band; treat any as a failed query.
            if "errors" in data:
                logger.error(f"Taddy API error: {data['errors']}")
                return None
            return data.get("data")
        except Exception as e:
            error_str = str(e).lower()
            # Check for 500 error (quota exceeded) - http_client raises ServiceError
            if "500" in error_str or "server error" in error_str:
                account_type = "fallback" if self.using_fallback else "primary"
                logger.warning(f"Taddy API returned 500 on {account_type} account (likely quota exceeded)")
                # Try fallback if available and we haven't already
                if retry_on_fallback and self._switch_to_fallback():
                    logger.info("Retrying with fallback Taddy account...")
                    return await self._graphql_query(query, variables, retry_on_fallback=False)
            logger.error(f"Taddy API request failed: {e}")
            return None

    async def search_podcast_appearances(
        self,
        celebrity_name: str,
        lookback_days: int = 730,  # 2 years
        lookahead_days: int = 30,
        limit: int = 25,
        max_pages: int = 10
    ) -> List[Dict]:
        """
        Search for podcast episodes featuring a celebrity.

        Args:
            celebrity_name: Name of the celebrity to search for
            lookback_days: How many days back to search
            lookahead_days: How many days forward to search (for scheduled
                releases). NOTE(review): currently unused -- the query only
                filters on publish-after, so future-dated episodes are
                returned regardless and marked "upcoming" below.
            limit: Maximum results per page
            max_pages: Maximum pages to fetch (Taddy caps search at 20 pages)

        Returns:
            List of podcast appearance dicts
        """
        appearances = []
        # Calculate date range
        now = datetime.now()
        start_date = now - timedelta(days=lookback_days)
        # Convert to Unix timestamp (seconds)
        start_timestamp = int(start_date.timestamp())
        query = """
        query SearchPodcastEpisodes($term: String!, $limitPerPage: Int, $page: Int, $filterForPublishedAfter: Int) {
            search(
                term: $term,
                filterForTypes: PODCASTEPISODE,
                matchBy: EXACT_PHRASE,
                limitPerPage: $limitPerPage,
                page: $page,
                filterForPublishedAfter: $filterForPublishedAfter
            ) {
                searchId
                podcastEpisodes {
                    uuid
                    name
                    description
                    datePublished
                    audioUrl
                    persons {
                        uuid
                        name
                        role
                    }
                    podcastSeries {
                        uuid
                        name
                        imageUrl
                    }
                    websiteUrl
                }
            }
        }
        """
        # Paginate through results (max 20 pages API limit, 25 per page = 500 max)
        # max_pages passed as parameter from config
        all_episodes = []
        for page in range(1, max_pages + 1):
            variables = {
                "term": celebrity_name,
                "limitPerPage": limit,
                "page": page,
                "filterForPublishedAfter": start_timestamp
            }
            data = await self._graphql_query(query, variables)
            if not data or not data.get("search"):
                break
            episodes = data["search"].get("podcastEpisodes", [])
            if not episodes:
                break  # No more results
            all_episodes.extend(episodes)
            # If we got fewer than limit, we've reached the end
            if len(episodes) < limit:
                break
            # Small delay between pages
            await asyncio.sleep(0.2)
        episodes = all_episodes
        for ep in episodes:
            try:
                # Parse the episode data
                podcast_series = ep.get("podcastSeries", {})
                ep_name = (ep.get("name") or "")
                podcast_name = (podcast_series.get("name") or "")
                name_lower = celebrity_name.lower()
                name_parts = name_lower.split()
                # ===== USE PERSONS METADATA FOR ACCURATE FILTERING =====
                # Check if celebrity is listed in the persons array with a role
                persons = ep.get("persons", []) or []
                person_match = None
                credit_type = None
                for person in persons:
                    person_name = (person.get("name") or "").lower()
                    # BUGFIX: skip unnamed persons. "" is a substring of every
                    # string, so an empty name would match ANY celebrity via
                    # the `person_name in name_lower` check below.
                    if not person_name:
                        continue
                    # Match full name or last name
                    if name_lower in person_name or person_name in name_lower:
                        person_match = person
                        credit_type = self._map_role(person.get("role"))
                        break
                    # Also check by last name for partial matches
                    elif len(name_parts) >= 2:
                        last_name = name_parts[-1]
                        first_name = name_parts[0]
                        # Require >= 4 chars to avoid short-name false positives
                        if len(last_name) >= 4 and (last_name in person_name or first_name in person_name):
                            person_match = person
                            credit_type = self._map_role(person.get("role"))
                            break
                # If person is in the persons list, include the episode
                if person_match:
                    logger.debug(f"Accepting '{ep_name}' - {celebrity_name} listed as {credit_type} in persons metadata")
                    is_host = (credit_type == "host")
                else:
                    # Fallback: check if they're the host via podcast series name
                    podcast_name_lower = podcast_name.lower()
                    is_host = name_lower in podcast_name_lower
                    if not is_host and len(name_parts) >= 2:
                        last_name = name_parts[-1]
                        first_name = name_parts[0]
                        if len(last_name) >= 4:
                            is_host = (f"with {last_name}" in podcast_name_lower or
                                       f"with {first_name}" in podcast_name_lower or
                                       f"{first_name} {last_name}" in podcast_name_lower)
                    if is_host:
                        credit_type = "host"
                        logger.debug(f"Accepting '{ep_name}' - host podcast (name in series title)")
                    else:
                        # No persons metadata - use WHITELIST approach
                        # Only accept if title clearly indicates an interview/guest appearance
                        ep_name_lower = ep_name.lower()
                        if name_lower not in ep_name_lower:
                            logger.debug(f"Skipping '{ep_name}' - name not in title")
                            continue
                        # Check podcast name for news/gossip shows first
                        garbage_podcast_names = ['news', 'gossip', 'rumor', 'daily', 'trending', 'tmz', 'variety', 'march madness', 'cruz show', 'aesthetic arrest', 'devious maids']
                        if any(word in podcast_name_lower for word in garbage_podcast_names):
                            logger.debug(f"Skipping '{ep_name}' - podcast name suggests news/gossip")
                            continue
                        # Reject listicles (multiple comma-separated topics)
                        comma_count = ep_name_lower.count(',')
                        if comma_count >= 3:
                            logger.debug(f"Skipping '{ep_name}' - listicle format ({comma_count} commas)")
                            continue
                        # WHITELIST: Only accept if title matches clear interview patterns
                        interview_patterns = [
                            # Direct interview indicators
                            rf'(interview|interviews|interviewing)\s+(with\s+)?{re.escape(name_lower)}',
                            rf'{re.escape(name_lower)}\s+(interview|interviewed)',
                            # Guest indicators
                            rf'(guest|featuring|feat\.?|ft\.?|with guest|special guest)[:\s]+{re.escape(name_lower)}',
                            rf'{re.escape(name_lower)}\s+(joins|joined|stops by|sits down|talks|speaks|discusses|shares|reveals|opens up|gets real|gets honest)',
                            # "Name on Topic" format (common interview title)
                            rf'^{re.escape(name_lower)}\s+on\s+',
                            # Episode number + name format ("Ep 123: Name...")
                            rf'^(ep\.?|episode|#)\s*\d+[:\s]+{re.escape(name_lower)}',
                            # Name at start followed by colon or dash (interview format)
                            rf'^{re.escape(name_lower)}\s*[:\-–—]\s*',
                            # "Conversation with Name"
                            rf'(conversation|chat|talk|talking|speaking)\s+with\s+{re.escape(name_lower)}',
                            # "Name Returns" / "Name is Back"
                            rf'{re.escape(name_lower)}\s+(returns|is back|comes back)',
                            # Q&A format
                            rf'(q&a|q\s*&\s*a|ama)\s+(with\s+)?{re.escape(name_lower)}',
                            # Podcast-specific patterns
                            rf'{re.escape(name_lower)}\s+(live|in studio|in the studio|on the show|on the pod)',
                        ]
                        is_interview = False
                        for pattern in interview_patterns:
                            if re.search(pattern, ep_name_lower):
                                is_interview = True
                                logger.debug(f"Accepting '{ep_name}' - matches interview pattern")
                                break
                        if not is_interview:
                            logger.debug(f"Skipping '{ep_name}' - no interview pattern match (name just mentioned)")
                            continue
                        credit_type = "guest"
                # Get the artwork URL from podcast series
                artwork_url = podcast_series.get("imageUrl")
                # Parse date
                date_published = ep.get("datePublished")
                if date_published:
                    # Taddy returns Unix timestamp in seconds
                    try:
                        pub_date = datetime.fromtimestamp(date_published)
                        appearance_date = pub_date.strftime("%Y-%m-%d")
                        # Future publish date => episode is scheduled, not aired
                        status = "upcoming" if pub_date.date() > now.date() else "aired"
                    except (ValueError, TypeError):
                        appearance_date = None
                        status = "aired"
                else:
                    appearance_date = None
                    status = "aired"
                # Get episode URL
                episode_url = ep.get("websiteUrl")
                appearance = {
                    "appearance_type": "Podcast",
                    "show_name": podcast_series.get("name", "Unknown Podcast"),
                    "episode_title": ep.get("name"),
                    "appearance_date": appearance_date,
                    "status": status,
                    "description": strip_html(ep.get("description")),
                    "poster_url": artwork_url,
                    "audio_url": ep.get("audioUrl"),
                    "url": episode_url,
                    "credit_type": credit_type or ("host" if is_host else "guest"),
                    "character_name": "Self",
                    "taddy_episode_uuid": ep.get("uuid"),
                    "taddy_podcast_uuid": podcast_series.get("uuid"),
                    "duration_seconds": None,  # Duration removed from query to reduce complexity
                }
                appearances.append(appearance)
                logger.info(f"Found podcast appearance: {celebrity_name} on '{podcast_series.get('name')}' - {ep.get('name')}")
            except Exception as e:
                # Best-effort per-episode parsing: one bad episode shouldn't
                # abort the whole search.
                logger.error(f"Error parsing Taddy episode: {e}")
                continue
        return appearances

    async def test_connection(self) -> bool:
        """Test if the API credentials are valid"""
        query = """
        query TestConnection {
            search(term: "test", filterForTypes: PODCASTSERIES, limitPerPage: 1) {
                searchId
            }
        }
        """
        data = await self._graphql_query(query)
        return data is not None