Fetch chapter names [if available] for transcript segments #254

Open · wants to merge 1 commit into base: master
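
This PR threads a new `include_chapter_tags` keyword through `get_transcript`, `get_transcripts` and `list_transcripts`, so chapter titles can be attached to each transcript segment. Below is a minimal sketch of how the flag could be used once merged; the video ID is a placeholder, and the printed keys follow the `parse` changes in `_transcripts.py` further down:

```python
from youtube_transcript_api import YouTubeTranscriptApi

# Placeholder video ID; include_chapter_tags is the new keyword added by this PR.
segments = YouTubeTranscriptApi.get_transcript('VIDEO_ID', include_chapter_tags=True)

for segment in segments[:5]:
    # Each segment keeps the usual 'text', 'start' and 'duration' keys; when chapter
    # data is found, a 'chapters' list of matching chapter titles is added as well.
    print(segment['start'], segment.get('chapters'), segment['text'])
```
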
13 changes: 7 additions & 6 deletions youtube_transcript_api/_api.py
@@ -16,7 +16,7 @@

class YouTubeTranscriptApi(object):
@classmethod
def list_transcripts(cls, video_id, proxies=None, cookies=None):
def list_transcripts(cls, video_id, proxies=None, cookies=None, include_chapter_tags=False):
"""
Retrieves the list of transcripts which are available for a given video. It returns a `TranscriptList` object
which is iterable and provides methods to filter the list of transcripts for specific languages. While iterating
@@ -68,11 +68,11 @@ def list_transcripts(cls, video_id, proxies=None, cookies=None):
if cookies:
http_client.cookies = cls._load_cookies(cookies, video_id)
http_client.proxies = proxies if proxies else {}
return TranscriptListFetcher(http_client).fetch(video_id)
return TranscriptListFetcher(http_client).fetch(video_id, include_chapter_tags)

@classmethod
def get_transcripts(cls, video_ids, languages=('en',), continue_after_error=False, proxies=None,
cookies=None, preserve_formatting=False):
cookies=None, preserve_formatting=False, include_chapter_tags=False):
"""
Retrieves the transcripts for a list of videos.

@@ -102,7 +102,7 @@ def get_transcripts(cls, video_ids, languages=('en',), continue_after_error=Fals

for video_id in video_ids:
try:
data[video_id] = cls.get_transcript(video_id, languages, proxies, cookies, preserve_formatting)
data[video_id] = cls.get_transcript(video_id, languages, proxies, cookies, preserve_formatting, include_chapter_tags)
except Exception as exception:
if not continue_after_error:
raise exception
@@ -112,7 +112,8 @@ def get_transcripts(cls, video_ids, languages=('en',), continue_after_error=Fals
return data, unretrievable_videos

@classmethod
def get_transcript(cls, video_id, languages=('en',), proxies=None, cookies=None, preserve_formatting=False):
def get_transcript(cls, video_id, languages=('en',), proxies=None, cookies=None, preserve_formatting=False,
include_chapter_tags=False):
"""
Retrieves the transcript for a single video. This is just a shortcut for calling::

@@ -134,7 +135,7 @@ def get_transcript(cls, video_id, languages=('en',), proxies=None, cookies=None,
:rtype [{'text': str, 'start': float, 'end': float}]:
"""
assert isinstance(video_id, str), "`video_id` must be a string"
return cls.list_transcripts(video_id, proxies, cookies).find_transcript(languages).fetch(preserve_formatting=preserve_formatting)
return cls.list_transcripts(video_id, proxies, cookies, include_chapter_tags).find_transcript(languages).fetch(preserve_formatting=preserve_formatting)

@classmethod
def _load_cookies(cls, cookies, video_id):
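
The same flag is also threaded through the lower-level `list_transcripts` path shown above. A sketch under the same assumptions (placeholder video ID, English transcript available):

```python
from youtube_transcript_api import YouTubeTranscriptApi

# Placeholder video ID; the flag defaults to False, so existing callers are unaffected.
transcript_list = YouTubeTranscriptApi.list_transcripts('VIDEO_ID', include_chapter_tags=True)
transcript = transcript_list.find_transcript(['en'])

# With this PR, the extracted chapter list (or None if none was found) is kept on the
# Transcript object and merged into the segments when fetch() parses them.
print(transcript.chapters)
segments = transcript.fetch()
```
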
3 changes: 3 additions & 0 deletions youtube_transcript_api/_errors.py
@@ -76,6 +76,9 @@ class TooManyRequests(CouldNotRetrieveTranscript):
class TranscriptsDisabled(CouldNotRetrieveTranscript):
CAUSE_MESSAGE = 'Subtitles are disabled for this video'

class ChaptersNotAvailable(CouldNotRetrieveTranscript):
CAUSE_MESSAGE = 'No chapter info available for this video'


class NoTranscriptAvailable(CouldNotRetrieveTranscript):
CAUSE_MESSAGE = 'No transcripts are available for this video'
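
To make the `_transcripts.py` changes below easier to follow, this is roughly the shape `_extract_chapters_json` returns; the titles and millisecond values here are made up for illustration:

```python
# Illustrative only; real values come from the "chapters" blob in the watch-page HTML.
chapters = [
    {'title': 'Intro', 'time_range_start_ms': 0, 'next_time_range_start_ms': 90000},
    {'title': 'Demo', 'time_range_start_ms': 90000, 'next_time_range_start_ms': 240000},
    # The last chapter has no successor, so its upper bound is left open-ended.
    {'title': 'Outro', 'time_range_start_ms': 240000, 'next_time_range_start_ms': float('inf')},
]
```
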
67 changes: 61 additions & 6 deletions youtube_transcript_api/_transcripts.py
@@ -25,6 +25,7 @@
NoTranscriptAvailable,
FailedToCreateConsentCookie,
InvalidVideoId,
ChaptersNotAvailable
)
from ._settings import WATCH_URL

@@ -41,12 +42,50 @@ class TranscriptListFetcher(object):
def __init__(self, http_client):
self._http_client = http_client

def fetch(self, video_id):
def fetch(self, video_id, include_chapter_tags=False):
video_html = self._fetch_video_html(video_id)
chapters = None
if include_chapter_tags:
try:
chapters = self._extract_chapters_json(video_html, video_id)
except Exception:
pass  # chapter extraction is best-effort; fall back to no chapter info

return TranscriptList.build(
self._http_client,
video_id,
self._extract_captions_json(self._fetch_video_html(video_id), video_id),
self._extract_captions_json(video_html, video_id),
chapters
)

def _extract_chapters_json(self, html, video_id):
splitted_html = html.split('"chapters":')

if len(splitted_html) <= 1:
if video_id.startswith('http://') or video_id.startswith('https://'):
raise InvalidVideoId(video_id)
if 'class="g-recaptcha"' in html:
raise TooManyRequests(video_id)
if '"playabilityStatus":' not in html:
raise VideoUnavailable(video_id)

raise ChaptersNotAvailable(video_id)

chapters_json = json.loads(
splitted_html[1].split(',"trackingParams":')[0].replace('\n', '')
)
if chapters_json is None:
raise ChaptersNotAvailable(video_id)

chapters_json = [
{
'title': chapter['chapterRenderer']['title']['simpleText'],
'time_range_start_ms': chapter['chapterRenderer']['timeRangeStartMillis'],
'next_time_range_start_ms': chapters_json[i+1]['chapterRenderer']['timeRangeStartMillis'] if i+1 < len(chapters_json) else float('inf'),
} for i, chapter in enumerate(chapters_json)
]

return chapters_json

def _extract_captions_json(self, html, video_id):
splitted_html = html.split('"captions":')
@@ -117,7 +156,7 @@ def __init__(self, video_id, manually_created_transcripts, generated_transcripts
self._translation_languages = translation_languages

@staticmethod
def build(http_client, video_id, captions_json):
def build(http_client, video_id, captions_json, chapters_json=None):
"""
Factory method for TranscriptList.

@@ -154,6 +193,7 @@ def build(http_client, video_id, captions_json):
caption['languageCode'],
caption.get('kind', '') == 'asr',
translation_languages if caption.get('isTranslatable', False) else [],
chapters_json
)

return TranscriptList(
@@ -253,7 +293,7 @@ def _get_language_description(self, transcript_strings):


class Transcript(object):
def __init__(self, http_client, video_id, url, language, language_code, is_generated, translation_languages):
def __init__(self, http_client, video_id, url, language, language_code, is_generated, translation_languages, chapters=None):
"""
You probably don't want to initialize this directly. Usually you'll access Transcript objects using a
TranscriptList.
@@ -267,6 +307,7 @@ def __init__(self, http_client, video_id, url, language, language_code, is_gener
:param language_code:
:param is_generated:
:param translation_languages:
:param chapters:
"""
self._http_client = http_client
self.video_id = video_id
@@ -279,6 +320,7 @@ def __init__(self, http_client, video_id, url, language, language_code, is_gener
translation_language['language_code']: translation_language['language']
for translation_language in translation_languages
}
self.chapters = chapters

def fetch(self, preserve_formatting=False):
"""
@@ -291,6 +333,7 @@ def fetch(self, preserve_formatting=False):
response = self._http_client.get(self._url, headers={'Accept-Language': 'en-US'})
return _TranscriptParser(preserve_formatting=preserve_formatting).parse(
_raise_http_errors(response, self.video_id).text,
self.chapters
)

def __str__(self):
@@ -348,8 +391,8 @@ def _get_html_regex(self, preserve_formatting):
html_regex = re.compile(r'<[^>]*>', re.IGNORECASE)
return html_regex

def parse(self, plain_data):
return [
def parse(self, plain_data, chapters=None):
data = [
{
'text': re.sub(self._html_regex, '', unescape(xml_element.text)),
'start': float(xml_element.attrib['start']),
@@ -358,3 +401,15 @@ def parse(self, plain_data):
for xml_element in ElementTree.fromstring(plain_data)
if xml_element.text is not None
]
if chapters:
for d in data:
start_ms = d['start'] * 1000
end_ms = start_ms + d['duration'] * 1000
d['chapters'] = [
c['title'] for c in chapters
if c['time_range_start_ms'] <= start_ms <= c['next_time_range_start_ms']
or c['time_range_start_ms'] <= end_ms <= c['next_time_range_start_ms']
]

return data
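
Putting it together, a segment that straddles a chapter boundary is tagged with both chapters. A self-contained illustration of the matching rule used in `_TranscriptParser.parse`, with a made-up chapter list and segment:

```python
# Made-up data; mirrors the chapter-matching rule added to _TranscriptParser.parse above.
chapters = [
    {'title': 'Intro', 'time_range_start_ms': 0, 'next_time_range_start_ms': 90000},
    {'title': 'Demo', 'time_range_start_ms': 90000, 'next_time_range_start_ms': float('inf')},
]
segment = {'text': 'and now for the demo', 'start': 89.0, 'duration': 4.0}

start_ms = segment['start'] * 1000
end_ms = start_ms + segment['duration'] * 1000
segment['chapters'] = [
    c['title'] for c in chapters
    if c['time_range_start_ms'] <= start_ms <= c['next_time_range_start_ms']
    or c['time_range_start_ms'] <= end_ms <= c['next_time_range_start_ms']
]

print(segment['chapters'])  # ['Intro', 'Demo'] -- the segment spans the 90s boundary
```
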