Integrations

ImageDescription

A client to interact with the image description API. The API is used to generate short phrases that describe an image.

Methods:

Name             Description
summarise_image  summarise_image(image: Image) -> str: Summarise the image using the LLaMA API.

Source code in video_sampler/integrations/llava_chat.py
class ImageDescription:
    """A client to interact with the image description API.
    The API is used to generate short phrases that describe an image.

    Methods:
        summarise_image(image: Image) -> str:
            Summarise the image using the LLaMA API.
    """

    def __init__(self, url: str) -> None:
        if url is None:
            url = "http://localhost:8080/"
        self.url = url

    def summarise_image(self, image: Image) -> str:
        """Summarise the image
        Args:
            image (Image): The image to summarise.
        Returns:
            str: The description of the image.
        """
        ...
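
ImageDescription is effectively an abstract base: summarise_image is declared but left unimplemented, so concrete backends subclass it. A minimal sketch of such a subclass, assuming Pillow is available; the CaptionStub name and its canned caption are purely illustrative and not part of the library:

from PIL import Image

from video_sampler.integrations.llava_chat import ImageDescription


class CaptionStub(ImageDescription):
    """Hypothetical backend that returns a fixed caption, e.g. for tests."""

    def summarise_image(self, image: Image) -> str:
        # A real backend would send the encoded image to a captioning service here.
        return "a placeholder description"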

summarise_image(image)

Summarise the image.

Args:
    image (Image): The image to summarise.

Returns:
    str: The description of the image.

Source code in video_sampler/integrations/llava_chat.py
def summarise_image(self, image: Image) -> str:
    """Summarise the image
    Args:
        image (Image): The image to summarise.
    Returns:
        str: The description of the image.
    """
    ...

ImageDescriptionDefault

Bases: ImageDescription

A client to interact with the LLaMA image description API. The API is used to generate short phrases that describe an image.

Methods:

Name             Description
summarise_image  summarise_image(image: Image) -> str: Summarise the image using the LLaMA API.

Source code in video_sampler/integrations/llava_chat.py
class ImageDescriptionDefault(ImageDescription):
    """A client to interact with the LLaMA image description API.
    The API is used to generate short phrases that describe an image.

    Methods:
        summarise_image(image: Image) -> str:
            Summarise the image using the LLaMA API.
    """

    def __init__(self, url: str = "http://localhost:8080/completion"):
        """Initialise the client with the base URL of the LLaMA API.
        Args:
            url (str): The base URL of the LLaMA API.
        """
        """TODO: migrate to OpenAI API when available"""
        super().__init__(url)
        self.headers = {
            "accept-language": "en-GB,en",
            "content-type": "application/json",
        }
        if api_key := os.getenv("OPENAI_API_KEY"):
            self.headers["Authorization"] = f"Bearer {api_key}"
        self.session = requests.Session()

    def get_prompt(self):
        return """You're an AI assistant that describes images using short phrases.
        The image is shown below.
        \nIMAGE:[img-10]
        \nASSISTANT:"""

    def summarise_image(self, image: Image) -> str:
        """Summarise the image using the LLaMA API.
        Args:
            image (Image): The image to summarise.
        Returns:
            str: The description of the image.
        """
        b64image = encode_image(resize_image(image))

        json_body = {
            "model": os.getenv("OPENAI_MODEL", "LLaVA_CPP"),
            "stream": False,
            "n_predict": 1000,
            "temperature": 0.1,
            "repeat_last_n": 78,
            "image_data": [{"data": b64image, "id": 10}],
            "cache_prompt": True,
            "top_k": 40,
            "top_p": 1,
            "min_p": 0.05,
            "tfs_z": 1,
            "typical_p": 1,
            "presence_penalty": 0,
            "frequency_penalty": 0,
            "mirostat": 0,
            "mirostat_tau": 5,
            "mirostat_eta": 0.1,
            "grammar": "",
            "n_probs": 0,
            "min_keep": 0,
            "api_key": os.getenv("OPENAI_API_KEY", ""),
            "slot_id": 0,
            "stop": ["</s>", "Llama:", "User:"],
            "prompt": self.get_prompt(),
        }
        response = self.session.post(
            f"{self.url}",
            json=json_body,
            headers=self.headers,
            stream=False,
        )
        if response.status_code != 200:
            print(f"Failed to summarise image: {response}")
            return None
        res = response.json()
        if "choices" in res:
            return res["choices"][0]["text"].strip()
        elif "content" in res:
            return res["content"].strip()
        raise ValueError(f"Failed to summarise image: unknown response format: {res}")
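
A minimal usage sketch, assuming a local llama.cpp server with a LLaVA model listening on the default /completion endpoint; the example.jpg path is only illustrative:

from PIL import Image

from video_sampler.integrations.llava_chat import ImageDescriptionDefault

client = ImageDescriptionDefault()        # defaults to http://localhost:8080/completion
frame = Image.open("example.jpg")         # any RGB frame sampled from a video
caption = client.summarise_image(frame)
print(caption)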

__init__(url='http://localhost:8080/completion')

Initialise the client with the base URL of the LLaMA API.

Args:
    url (str): The base URL of the LLaMA API.

Source code in video_sampler/integrations/llava_chat.py
def __init__(self, url: str = "http://localhost:8080/completion"):
    """Initialise the client with the base URL of the LLaMA API.
    Args:
        url (str): The base URL of the LLaMA API.
    """
    """TODO: migrate to OpenAI API when available"""
    super().__init__(url)
    self.headers = {
        "accept-language": "en-GB,en",
        "content-type": "application/json",
    }
    if api_key := os.getenv("OPENAI_API_KEY"):
        self.headers["Authorization"] = f"Bearer {api_key}"
    self.session = requests.Session()
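
The constructor also picks up OPENAI_API_KEY from the environment and forwards it as a Bearer token. A short sketch of that wiring, using a placeholder key purely for illustration:

import os

from video_sampler.integrations.llava_chat import ImageDescriptionDefault

os.environ["OPENAI_API_KEY"] = "sk-example"            # placeholder, not a real credential
client = ImageDescriptionDefault("http://localhost:8080/completion")
print(client.headers["Authorization"])                 # -> "Bearer sk-example"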

summarise_image(image)

Summarise the image using the LLaMA API.

Args:
    image (Image): The image to summarise.

Returns:
    str: The description of the image.

Source code in video_sampler/integrations/llava_chat.py
def summarise_image(self, image: Image) -> str:
    """Summarise the image using the LLaMA API.
    Args:
        image (Image): The image to summarise.
    Returns:
        str: The description of the image.
    """
    b64image = encode_image(resize_image(image))

    json_body = {
        "model": os.getenv("OPENAI_MODEL", "LLaVA_CPP"),
        "stream": False,
        "n_predict": 1000,
        "temperature": 0.1,
        "repeat_last_n": 78,
        "image_data": [{"data": b64image, "id": 10}],
        "cache_prompt": True,
        "top_k": 40,
        "top_p": 1,
        "min_p": 0.05,
        "tfs_z": 1,
        "typical_p": 1,
        "presence_penalty": 0,
        "frequency_penalty": 0,
        "mirostat": 0,
        "mirostat_tau": 5,
        "mirostat_eta": 0.1,
        "grammar": "",
        "n_probs": 0,
        "min_keep": 0,
        "api_key": os.getenv("OPENAI_API_KEY", ""),
        "slot_id": 0,
        "stop": ["</s>", "Llama:", "User:"],
        "prompt": self.get_prompt(),
    }
    response = self.session.post(
        f"{self.url}",
        json=json_body,
        headers=self.headers,
        stream=False,
    )
    if response.status_code != 200:
        print(f"Failed to summarise image: {response}")
        return None
    res = response.json()
    if "choices" in res:
        return res["choices"][0]["text"].strip()
    elif "content" in res:
        return res["content"].strip()
    raise ValueError(f"Failed to summarise image: unknown response format: {res}")

VideoSummary

A client to interact with the LLaMA video summarisation API. The API is used to generate a summary of a video based on image descriptions.

Methods:

Name             Description
summarise_video  summarise_video(image_descriptions: list[str]) -> str: Summarise the video using the LLaMA API.

Source code in video_sampler/integrations/llava_chat.py
class VideoSummary:
    """A client to interact with the LLaMA video summarisation API.
    The API is used to generate a summary of a video based on image descriptions.

    Methods:
        summarise_video(image_descriptions: list[str]) -> str:
            Summarise the video using the LLaMA API.
    """

    def __init__(self, url: str = "http://localhost:8080/v1"):
        """Initialise the client with the base URL of the LLaMA API.
        Args:
            url (str): The base URL of the LLaMA API."""
        if url is None:
            url = "http://localhost:8080/v1"
        super().__init__(url)

    def get_prompt(self):
        return """You're an AI assistant that summarises videos based on image descriptions.
        Combine image descriptions into a coherent summary of the video."""

    def summarise_video(self, image_descriptions: list[str]):
        """Summarise the video using the LLaMA API.
        Args:
            image_descriptions (list[str]): The descriptions of the images in the video.
        Returns:
            str: The summary of the video.
        """
        return self.client.chat.completions.create(
            model="LLaMA_CPP",
            messages=[
                {
                    "role": "system",
                    "content": self.get_prompt(),
                },
                {"role": "user", "content": "\n".join(image_descriptions)},
            ],
            max_tokens=300,
        )
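
summarise_video forwards the joined descriptions to an OpenAI-compatible chat completions endpoint through self.client. A rough equivalent using the openai package directly, assuming a local server exposing /v1 that ignores the API key; the descriptions below are made up:

from openai import OpenAI

descriptions = [
    "a person walks along a beach at sunset",
    "waves break against the shore",
]

client = OpenAI(base_url="http://localhost:8080/v1", api_key="sk-no-key-required")
response = client.chat.completions.create(
    model="LLaMA_CPP",
    messages=[
        {
            "role": "system",
            "content": "You're an AI assistant that summarises videos based on "
            "image descriptions. Combine image descriptions into a coherent "
            "summary of the video.",
        },
        {"role": "user", "content": "\n".join(descriptions)},
    ],
    max_tokens=300,
)
print(response.choices[0].message.content)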

__init__(url='http://localhost:8080/v1')

Initialise the client with the base URL of the LLaMA API.

Args:
    url (str): The base URL of the LLaMA API.

Source code in video_sampler/integrations/llava_chat.py
def __init__(self, url: str = "http://localhost:8080/v1"):
    """Initialise the client with the base URL of the LLaMA API.
    Args:
        url (str): The base URL of the LLaMA API."""
    if url is None:
        url = "http://localhost:8080/v1"
    super().__init__(url)

summarise_video(image_descriptions)

Summarise the video using the LLaMA API.

Args:
    image_descriptions (list[str]): The descriptions of the images in the video.

Returns:
    str: The summary of the video.

Source code in video_sampler/integrations/llava_chat.py
def summarise_video(self, image_descriptions: list[str]):
    """Summarise the video using the LLaMA API.
    Args:
        image_descriptions (list[str]): The descriptions of the images in the video.
    Returns:
        str: The summary of the video.
    """
    return self.client.chat.completions.create(
        model="LLaMA_CPP",
        messages=[
            {
                "role": "system",
                "content": self.get_prompt(),
            },
            {"role": "user", "content": "\n".join(image_descriptions)},
        ],
        max_tokens=300,
    )

encode_image(image)

Convert the image to base64

Source code in video_sampler/integrations/llava_chat.py
def encode_image(image: Image):
    """
    Convert the image to base64
    """
    # create a buffer to store the image
    buffer = io.BytesIO()
    # save the image to the buffer
    image.save(buffer, format="JPEG")
    # convert the image to base64
    return base64.b64encode(buffer.getvalue()).decode("utf-8")

resize_image(image, max_side=512)

Resize the image to max_side if any of the sides is greater than max_side. If max_side is None, the image is returned as is.

Source code in video_sampler/integrations/llava_chat.py
def resize_image(image: Image, max_side: int = 512):
    """
    Resize the image to max_side if any of the sides is greater than max_side.
    If max_side is None, the image is returned as is.
    """
    # get the image shape
    if max_side is None:
        return image
    width, height = image.size
    if max(width, height) > max_side:
        # resize the image to max_side
        # keeping the aspect ratio
        if width > height:
            new_width = max_side
            new_height = int(height * max_side / width)
        else:
            new_height = max_side
            new_width = int(width * max_side / height)
        return image.resize((new_width, new_height))
    return image
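
Together, resize_image and encode_image produce the base64 JPEG payload that summarise_image sends as image_data. A small round-trip sketch, assuming Pillow is installed; the synthetic 1920x1080 frame stands in for a sampled video frame:

from PIL import Image

from video_sampler.integrations.llava_chat import encode_image, resize_image

frame = Image.new("RGB", (1920, 1080))     # stand-in for a sampled video frame
small = resize_image(frame, max_side=512)  # 1920x1080 -> 512x288, aspect ratio preserved
payload = encode_image(small)              # base64-encoded JPEG string
print(small.size, len(payload))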