Skip to content

Language

KeywordExtractor

Extracts keywords from subtitles using spaCy.

Parameters:

Name Type Description Default
keywords list[str]

List of keywords to extract.

required

Attributes:

Name Type Description
keywords list[str]

List of keywords to extract.

nlp

spaCy language model for text processing.

lemmatized_keywords set[str]

Set of lemmatized keywords.

Methods:

Name Description
generate_segments

Captures keyword segments from a list of subtitles.

Source code in video_sampler/language/keyword_capture.py
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
class KeywordExtractor:
    """
    Extracts keywords from subtitles using spaCy.

    Keywords and subtitle text are both lower-cased and then lemmatized, and a
    subtitle matches when any of its token lemmas is one of the keyword lemmas.
    The keywords are joined with spaces and tokenized by spaCy, so a multi-word
    keyword contributes each of its tokens as a separate keyword.

    Args:
        keywords (list[str]): List of keywords to extract.

    Attributes:
        keywords (list[str]): List of keywords to extract, exactly as passed in.
        nlp: spaCy language model for text processing.
        lemmatized_keywords (set[str]): Set of lemmatized (lower-cased) keywords.

    Methods:
        generate_segments: Captures keyword segments from a list of subtitles.

    """

    def __init__(self, keywords: list[str]) -> None:
        """
        Load the spaCy pipeline and pre-compute the keyword lemmas.

        Args:
            keywords (list[str]): List of keywords to extract.

        Raises:
            ImportError: If spaCy is not installed.

        """
        # spaCy is an optional dependency, so import it lazily and give a
        # actionable message when it is missing.
        try:
            import spacy
        except ImportError as e:
            raise ImportError(
                "To use this feature install spacy by 'pip install spacy'"
            ) from e

        self.keywords = keywords
        self.nlp = spacy.load("en_core_web_sm", disable=["parser", "ner", "textcat"])
        # generate_segments lower-cases subtitle text before lemmatizing it, so
        # the keywords must be lower-cased as well; otherwise a capitalised
        # keyword can keep a case-preserved lemma and never match any subtitle.
        self.lemmatized_keywords = {
            tok.lemma_ for tok in self.nlp(" ".join(self.keywords).lower())
        }
        console.print(
            f"Keyword capture initialised with: {keywords}",
            style=f"bold {Color.magenta.value}",
        )

    def generate_segments(
        self, subtitle_list: list[tuple[tuple[int, int], str]]
    ) -> Iterable[subtitle_line]:
        """
        Captures keyword segments from a list of subtitles.

        At most one segment is produced per subtitle: scanning stops at the
        first token whose lemma is a keyword.

        Args:
            subtitle_list (list[tuple[tuple[int, int], str]]): List of subtitles in the
                format ((start_time, end_time), content).

        Yields:
            subtitle_line: A named tuple representing a keyword segment in the format
                (start_time, end_time, token, content), where ``token`` is the
                matching token object from the processed subtitle (its ``lemma_``
                is the matched keyword lemma) and ``content`` is the original,
                non-lower-cased subtitle text.

        """
        for (start_time, end_time), content in subtitle_list:
            doc = self.nlp(content.lower())
            for token in doc:
                if token.lemma_ in self.lemmatized_keywords:
                    console.print(
                        f"Keyword {token.lemma_}: {start_time} - {end_time}",
                        style=f"bold {Color.green.value}",
                    )
                    yield subtitle_line(start_time, end_time, token, content)
                    # One hit is enough to mark this subtitle as a segment.
                    break

generate_segments(subtitle_list)

Captures keyword segments from a list of subtitles.

Parameters:

Name Type Description Default
subtitle_list list[tuple[tuple[int, int], str]]

List of subtitles, each in the format ((start_time, end_time), content).

required

Yields:

Name Type Description
subtitle_line Iterable[subtitle_line]

A named tuple representing a keyword segment in the format (start_time, end_time, lemma, content).

Source code in video_sampler/language/keyword_capture.py
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
def generate_segments(
    self, subtitle_list: list[tuple[tuple[int, int], str]]
) -> Iterable[subtitle_line]:
    """
    Yield a keyword segment for every subtitle that mentions a keyword.

    Each subtitle is lower-cased and run through the spaCy pipeline; the first
    token whose lemma is in ``self.lemmatized_keywords`` decides the match, so
    a subtitle produces at most one segment.

    Args:
        subtitle_list (list[tuple[tuple[int, int], str]]): Subtitles given as
            ((start_time, end_time), content) pairs.

    Yields:
        subtitle_line: A named tuple built from
            (start_time, end_time, token, content), where ``token`` is the
            matching token object and ``content`` is the original subtitle text.

    """
    for (start_time, end_time), content in subtitle_list:
        # First token (in document order) whose lemma is a tracked keyword.
        hit = next(
            (
                tok
                for tok in self.nlp(content.lower())
                if tok.lemma_ in self.lemmatized_keywords
            ),
            None,
        )
        if hit is None:
            continue
        console.print(
            f"Keyword {hit.lemma_}: {start_time} - {end_time}",
            style=f"bold {Color.green.value}",
        )
        yield subtitle_line(start_time, end_time, hit, content)

download_sub(sub_url, max_retries=2)

Download a subtitle file and parse it with the SRT parser into subtitle segments, retrying when the request fails.

Source code in video_sampler/language/keyword_capture.py
14
15
16
17
18
19
20
21
22
23
def download_sub(sub_url: str, max_retries: int = 2, timeout: float = 30.0):
    """Download a subtitle file and parse it as SRT, retrying failed requests.

    Args:
        sub_url (str): URL of the subtitle file.
        max_retries (int): Maximum number of download attempts (the loop runs
            this many times in total).
        timeout (float): Seconds passed to ``requests.get`` as its timeout, so
            a stalled server fails the attempt instead of blocking forever.

    Returns:
        The result of ``parse_srt_subtitle`` on the response text, or ``None``
        if every attempt failed with a request error.
    """
    for _ in range(max_retries):
        try:
            response = requests.get(url=sub_url, timeout=timeout)
            response.raise_for_status()
        except RequestException as e:
            # Timeouts and HTTP error statuses land here; report and retry.
            console.print(f"Download failed: {str(e)}", style=f"bold {Color.red.value}")
        else:
            # Parse outside the try body so only request errors trigger a retry.
            return parse_srt_subtitle(response.text)
    return None

parse_srt_subtitle(srt_content)

Parse SRT subtitle content into a list of subtitle segments.

Source code in video_sampler/language/keyword_capture.py
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
def parse_srt_subtitle(srt_content: str) -> list[tuple[tuple[int, int], str]]:
    """Turn SRT subtitle text into a list of ((start, end), text) segments.

    Args:
        srt_content (str): Raw SRT subtitle text. An empty value yields an
            empty list.

    Returns:
        One ((start, end), text) tuple per subtitle entry, where start and end
        are pysrt's ``ordinal`` values for the entry's timestamps
        (milliseconds according to pysrt; confirm if precision matters).

    Raises:
        ImportError: If pysrt is not installed.
    """
    # pysrt is an optional dependency; it is checked before looking at the input.
    try:
        import pysrt
    except ImportError as e:
        raise ImportError(
            "To use this feature install pysrt by 'pip install pysrt'"
        ) from e

    if not srt_content:
        return []
    return [
        ((entry.start.ordinal, entry.end.ordinal), entry.text)
        for entry in pysrt.from_string(srt_content)
    ]