Skip to content

Language

KeywordExtractor

Extracts keywords from subtitles using spaCy.

Parameters:

Name Type Description Default
keywords list[str]

List of keywords to extract.

required

Attributes:

Name Type Description
keywords list[str]

List of keywords to extract.

nlp

spaCy language model for text processing.

lemmatized_keywords set[str]

Set of lemmatized keywords.

Methods:

Name Description
generate_segments

Captures keyword segments from a list of subtitles.

Source code in video_sampler/language/keyword_capture.py
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
class KeywordExtractor:
    """
    Extracts keywords from subtitles using spaCy.

    Keywords and subtitle text are both lower-cased and then lemmatized with
    the same spaCy pipeline; a subtitle matches when any of its token lemmas
    is one of the keyword lemmas.

    Args:
        keywords (list[str]): List of keywords to extract.

    Attributes:
        keywords (list[str]): List of keywords to extract, exactly as supplied.
        nlp: spaCy language model for text processing.
        lemmatized_keywords (set[str]): Set of lemmatized (lower-cased) keywords.

    Methods:
        generate_segments: Captures keyword segments from a list of subtitles.

    """

    def __init__(self, keywords: list[str]) -> None:
        self.keywords = keywords
        self.nlp = spacy.load("en_core_web_sm", disable=["parser", "ner", "textcat"])
        # Subtitle content is lower-cased before lemmatization in
        # generate_segments; normalise the keywords the same way so that a
        # keyword written with capitals can still match.
        self.lemmatized_keywords = {
            tok.lemma_ for tok in self.nlp(" ".join(self.keywords).lower())
        }
        console.print(
            f"Keyword capture initialised with: {keywords}",
            style=f"bold {Color.magenta.value}",
        )

    def generate_segments(
        self, subtitle_list: list[tuple[tuple[int, int], str]]
    ) -> Iterable[subtitle_line]:
        """
        Captures keyword segments from a list of subtitles.

        Args:
            subtitle_list (list[tuple[tuple[int, int], str]]): List of subtitles,
                each in the format ((start_time, end_time), content).

        Yields:
            subtitle_line: One entry per subtitle that contains a keyword, built
                from (start_time, end_time, token, content), where ``token`` is
                the first spaCy token of the lower-cased content whose lemma is
                a keyword lemma and ``content`` is the original subtitle text.

        """
        for (start_time, end_time), content in subtitle_list:
            doc = self.nlp(content.lower())
            for token in doc:
                if token.lemma_ in self.lemmatized_keywords:
                    console.print(
                        f"Keyword {token.lemma_}: {start_time} - {end_time}",
                        style=f"bold {Color.green.value}",
                    )
                    # The token object itself (not its lemma string) is passed
                    # on, as before, so existing consumers keep working.
                    yield subtitle_line(start_time, end_time, token, content)
                    # Report each subtitle at most once.
                    break

generate_segments(subtitle_list)

Captures keyword segments from a list of subtitles.

Parameters:

Name Type Description Default
subtitle_list list[tuple[int, int, str]]

List of subtitles, each in the format ((start_time, end_time), content).

required

Yields:

Name Type Description
subtitle_line Iterable[subtitle_line]

A named tuple representing a keyword segment in the format (start_time, end_time, lemma, content).

Source code in video_sampler/language/keyword_capture.py
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
def generate_segments(
    self, subtitle_list: list[tuple[int, int, str]]
) -> Iterable[subtitle_line]:
    """
    Yield one keyword segment for every subtitle that mentions a keyword.

    Args:
        subtitle_list (list[tuple[int, int, str]]): Subtitles to scan. Each
            entry is unpacked as ((start_time, end_time), content).

    Yields:
        subtitle_line: Built from (start_time, end_time, token, content), where
            ``token`` is the first token of the lower-cased content whose lemma
            is in ``self.lemmatized_keywords``.

    """
    for span, text in subtitle_list:
        begin, finish = span
        # Only the first matching token in a subtitle is of interest.
        hit = next(
            (
                tok
                for tok in self.nlp(text.lower())
                if tok.lemma_ in self.lemmatized_keywords
            ),
            None,
        )
        if hit is None:
            continue
        console.print(
            f"Keyword {hit.lemma_}: {begin} - {finish}",
            style=f"bold {Color.green.value}",
        )
        yield subtitle_line(begin, finish, hit, text)

download_sub(sub_url, max_retries=2)

Download a subtitle file and parse it into a list of subtitle segments, retrying on request failure.

Source code in video_sampler/language/keyword_capture.py
16
17
18
19
20
21
22
23
24
25
def download_sub(sub_url: str, max_retries: int = 2, timeout: float = 30.0):
    """Download a subtitle file and parse it into subtitle segments.

    Args:
        sub_url (str): URL of the subtitle file.
        max_retries (int): Total number of download attempts to make.
        timeout (float): Seconds to wait on each request before giving up on
            that attempt; without it a stalled server would block forever.

    Returns:
        The result of ``parse_srt_subtitle`` on the response body for the
        first successful attempt, or ``None`` if every attempt failed.
    """
    for _ in range(max_retries):
        try:
            response = requests.get(url=sub_url, timeout=timeout)
            response.raise_for_status()
        except RequestException as e:
            # Timeouts and HTTP error statuses both land here; report and retry.
            console.print(f"Download failed: {e}", style=f"bold {Color.red.value}")
        else:
            return parse_srt_subtitle(response.text)
    return None

parse_srt_subtitle(srt_content)

Parse an SRT subtitle file into a list of subtitle segments.

Source code in video_sampler/language/keyword_capture.py
28
29
30
31
32
33
34
35
36
37
38
def parse_srt_subtitle(srt_content):
    """Turn SRT subtitle text into a list of ((start, end), text) entries.

    ``start`` and ``end`` are the ``ordinal`` values of each cue's start and
    end times as reported by pysrt. Empty or missing input gives an empty list.
    """
    if not srt_content:
        return []
    return [
        ((cue.start.ordinal, cue.end.ordinal), cue.text)
        for cue in pysrt.from_string(srt_content)
    ]