Source code for ractogateway.pipelines.video_processor._summarizer

"""Comprehensive summary generation for VideoProcessorPipeline.

Combines visual frame analyses + audio transcripts → one structured summary
covering whiteboard equations, screen content, and spoken explanations.
"""

from __future__ import annotations

from typing import Any

from ._models import TranscriptSegment, VideoProcessorUsage, VideoSection


def _chat_with_prompt_sync(kit: Any, *, prompt: Any, user_message: str) -> Any:  # noqa: ANN401
    """Send *prompt* through ``kit.chat`` and return whatever the kit returns.

    The modern calling convention is tried first: a single ``ChatConfig``
    built from *user_message* and *prompt*.  If building the config or the
    call itself raises ``TypeError``, the call is retried with the legacy
    keyword form ``kit.chat(prompt=prompt)``.

    NOTE(review): the legacy path does not forward *user_message*, and a
    ``TypeError`` raised from inside ``kit.chat`` also triggers the retry —
    confirm both are intended.
    """
    from ractogateway._models.chat import ChatConfig  # noqa: PLC0415

    try:
        # Config construction stays inside the ``try`` so that a TypeError
        # from ``ChatConfig`` itself also falls through to the legacy call.
        config = ChatConfig(user_message=user_message, prompt=prompt)
        response = kit.chat(config)
    except TypeError:
        response = kit.chat(prompt=prompt)
    return response


async def _chat_with_prompt_async(
    kit: Any,
    *,
    prompt: Any,
    user_message: str,
) -> Any:  # noqa: ANN401
    """Await ``kit.achat`` for *prompt* and return whatever the kit returns.

    The modern calling convention is tried first: a single ``ChatConfig``
    built from *user_message* and *prompt*.  If building the config or the
    awaited call raises ``TypeError``, the call is retried with the legacy
    keyword form ``kit.achat(prompt=prompt)``.

    NOTE(review): the legacy path does not forward *user_message*, and a
    ``TypeError`` raised from inside ``kit.achat`` also triggers the retry —
    confirm both are intended.
    """
    from ractogateway._models.chat import ChatConfig  # noqa: PLC0415

    try:
        # Config construction stays inside the ``try`` so that a TypeError
        # from ``ChatConfig`` itself also falls through to the legacy call.
        config = ChatConfig(user_message=user_message, prompt=prompt)
        response = await kit.achat(config)
    except TypeError:
        response = await kit.achat(prompt=prompt)
    return response

# ---------------------------------------------------------------------------
# Summary prompt
# ---------------------------------------------------------------------------

# Instruction text passed as the ``aim`` of the RactoPrompt built in
# ``generate_summary_sync`` / ``generate_summary_async``.
#
# The numbered list below defines the seven sections of the summary; the
# prompt constraints in those functions refer to "the 7 numbered sections",
# so keep the numbering here and those constraint strings in sync.
#
# A trailing backslash inside the literal is a line continuation: the opening
# one suppresses the leading newline, and the ones in items 3 and 5 keep each
# item on a single logical line in the resulting string.
_SUMMARY_SYSTEM_PROMPT = """\
You are an expert at analysing recorded tutorial and lecture videos.
You will receive a chronological log of:
  • Visual content (what was written on the whiteboard/board, shown on screen)
  • Audio transcript (what the presenter said)

Your task is to generate a comprehensive, structured summary. The summary MUST include:

1. **Overview** - What is this video about? (2-3 sentences)
2. **Key Topics Covered** — Bulleted list of main subjects
3. **Whiteboard / Board Content** — ALL equations, formulas, proofs, diagrams described \
verbatim. Group related items together. Use LaTeX-style notation where helpful.
4. **Screen / Slide Content** — ALL text, code, charts or diagrams shown on screen
5. **Detailed Explanation** — A section-by-section walkthrough aligned with timestamps, \
combining what was said and what was shown
6. **Key Concepts & Definitions** — Important terms and their meanings as explained
7. **Conclusions / Takeaways** — What should the viewer remember?

Be thorough. Do not omit any equation or formula from the board content section."""


def _build_context(
    sections: list[VideoSection],
    transcript: list[TranscriptSegment],
    max_chars: int = 80_000,
) -> str:
    """Build the LLM context string from sections + transcript."""
    lines: list[str] = []

    # Timeline sections (visual + audio merged)
    if sections:
        lines.append("=== TIMELINE (visual + audio by timestamp) ===\n")
        for sec in sections:
            lines.append(
                f"[{sec.timestamp_start:.1f}s -{sec.timestamp_end:.1f}s]"
            )
            if sec.visual_content:
                lines.append(f"  VISUAL: {sec.visual_content}")
            if sec.audio_content:
                lines.append(f"  AUDIO:  {sec.audio_content}")
            lines.append("")

    # Full transcript (for context if no sections)
    if transcript and not sections:
        lines.append("=== FULL TRANSCRIPT ===\n")
        for seg in transcript:
            lines.append(f"[{seg.start:.1f}s] {seg.text}")

    context = "\n".join(lines)
    # Trim if too long
    if len(context) > max_chars:
        context = context[:max_chars] + "\n\n[… context truncated for length …]"
    return context



[docs]
def generate_summary_sync(
    sections: list[VideoSection],
    transcript: list[TranscriptSegment],
    kit: Any,
    usage: VideoProcessorUsage,
) -> str:
    """Produce the Markdown summary for a video with a blocking LLM call.

    The context text is built from *sections* / *transcript*.  If it is blank,
    a fixed placeholder string is returned without calling *kit* and *usage*
    is left untouched.  Otherwise the prompt is sent through *kit*, the
    ``prompt_tokens`` / ``completion_tokens`` values from the response's
    ``usage`` mapping (missing keys count as 0) are added to
    ``usage.summary_input_tokens`` / ``usage.summary_output_tokens``, and the
    response content is returned — or a fixed placeholder if it is empty.
    """
    from ractogateway.prompts.engine import RactoPrompt  # noqa: PLC0415

    timeline = _build_context(sections, transcript)
    if not timeline.strip():
        return "(No content extracted from this video.)"

    summary_constraints = [
        "Include every equation and formula exactly as it appeared",
        "Preserve mathematical notation faithfully",
        "Organise by the 7 numbered sections listed above",
        "Use Markdown formatting with headers and bullet points",
    ]
    summary_prompt = RactoPrompt(
        role="expert lecture and tutorial summariser",
        aim=_SUMMARY_SYSTEM_PROMPT,
        constraints=summary_constraints,
        tone="Professional and comprehensive.",
        output_format="A structured Markdown document with all 7 required sections",
        context=timeline,
    )

    reply = _chat_with_prompt_sync(
        kit,
        prompt=summary_prompt,
        user_message="Generate a comprehensive summary from the provided timeline context.",
    )

    # ``reply.usage`` may be falsy (e.g. None); treat that as "no counts".
    token_counts = reply.usage or {}
    usage.summary_input_tokens += token_counts.get("prompt_tokens", 0)
    usage.summary_output_tokens += token_counts.get("completion_tokens", 0)

    summary_text = reply.content
    if summary_text:
        return summary_text
    return "(Summary generation returned no content.)"




[docs]
async def generate_summary_async(
    sections: list[VideoSection],
    transcript: list[TranscriptSegment],
    kit: Any,
    usage: VideoProcessorUsage,
) -> str:
    """Produce the Markdown summary for a video with an awaited LLM call.

    Coroutine counterpart of :func:`generate_summary_sync`.  If the context
    built from *sections* / *transcript* is blank, a fixed placeholder string
    is returned without calling *kit* and *usage* is left untouched.
    Otherwise the prompt is sent through *kit*, the ``prompt_tokens`` /
    ``completion_tokens`` values from the response's ``usage`` mapping
    (missing keys count as 0) are added to ``usage.summary_input_tokens`` /
    ``usage.summary_output_tokens``, and the response content is returned —
    or a fixed placeholder if it is empty.
    """
    from ractogateway.prompts.engine import RactoPrompt  # noqa: PLC0415

    timeline = _build_context(sections, transcript)
    if not timeline.strip():
        return "(No content extracted from this video.)"

    summary_constraints = [
        "Include every equation and formula exactly as it appeared",
        "Preserve mathematical notation faithfully",
        "Organise by the 7 numbered sections listed above",
        "Use Markdown formatting with headers and bullet points",
    ]
    summary_prompt = RactoPrompt(
        role="expert lecture and tutorial summariser",
        aim=_SUMMARY_SYSTEM_PROMPT,
        constraints=summary_constraints,
        tone="Professional and comprehensive.",
        output_format="A structured Markdown document with all 7 required sections",
        context=timeline,
    )

    reply = await _chat_with_prompt_async(
        kit,
        prompt=summary_prompt,
        user_message="Generate a comprehensive summary from the provided timeline context.",
    )

    # ``reply.usage`` may be falsy (e.g. None); treat that as "no counts".
    token_counts = reply.usage or {}
    usage.summary_input_tokens += token_counts.get("prompt_tokens", 0)
    usage.summary_output_tokens += token_counts.get("completion_tokens", 0)

    summary_text = reply.content
    if summary_text:
        return summary_text
    return "(Summary generation returned no content.)"



# ---------------------------------------------------------------------------
# Section builder (merges frames + transcript into VideoSections)
# ---------------------------------------------------------------------------



[docs]
def build_sections(
    frames: list,  # list[FrameEntry]
    transcript: list[TranscriptSegment],
) -> list[VideoSection]:
    """Merge visual frame analyses and transcript segments into VideoSections.

    Only frames whose ``kept`` flag is truthy take part.

    With a transcript, each segment becomes one section, in transcript order.
    A kept frame is attached to a segment when
    ``seg.start <= frame.timestamp <= seg.end``.  Both bounds are inclusive,
    so a frame sitting exactly on a boundary shared by two segments is
    attached to both, and a kept frame that falls inside no segment is not
    represented in the result.  The section's ``visual_content`` is the
    non-empty ``analysis`` texts of its frames joined by blank lines, and its
    ``audio_content`` is the segment text.

    Without a transcript, each kept frame becomes its own zero-length section
    (start == end == the frame's timestamp) with empty ``audio_content``.

    Args:
        frames: Frame entries exposing ``kept``, ``timestamp``, ``frame_id``
            and ``analysis``.
        transcript: Transcript segments exposing ``start``, ``end`` and
            ``text``; may be empty.

    Returns:
        The list of sections, possibly empty.
    """
    from ._models import VideoSection  # noqa: PLC0415

    kept_frames = [f for f in frames if f.kept]

    if not transcript:
        # No transcript — one section per frame
        return [
            VideoSection(
                timestamp_start=f.timestamp,
                timestamp_end=f.timestamp,
                frame_ids=[f.frame_id],
                visual_content=f.analysis or "",
                audio_content="",
            )
            for f in kept_frames
        ]

    sections: list[VideoSection] = []
    for seg in transcript:
        # Select the frames for this segment once and derive both the ids and
        # the visual text from that selection.  This avoids re-scanning every
        # kept frame with a list-membership test on the ids, and guarantees
        # the visual text only comes from frames inside the time window.
        in_window = [f for f in kept_frames if seg.start <= f.timestamp <= seg.end]
        sections.append(
            VideoSection(
                timestamp_start=seg.start,
                timestamp_end=seg.end,
                frame_ids=[f.frame_id for f in in_window],
                visual_content="\n\n".join(f.analysis for f in in_window if f.analysis),
                audio_content=seg.text,
            )
        )
    return sections