Fetch chapter names [if available] for transcript segments #254

Open · wants to merge 1 commit into base: master
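
This PR threads a new `include_chapter_tags` keyword through `get_transcript`, `get_transcripts` and `list_transcripts`, so chapter titles can be attached to each transcript segment. Below is a minimal sketch of how the flag could be used once merged; the video ID is a placeholder, and the printed keys follow the `parse` changes in `_transcripts.py` further down:

```python
from youtube_transcript_api import YouTubeTranscriptApi

# Placeholder video ID; include_chapter_tags is the new keyword added by this PR.
segments = YouTubeTranscriptApi.get_transcript('VIDEO_ID', include_chapter_tags=True)

for segment in segments[:5]:
    # Each segment keeps the usual 'text', 'start' and 'duration' keys; when chapter
    # data is found, a 'chapters' list of matching chapter titles is added as well.
    print(segment['start'], segment.get('chapters'), segment['text'])
```
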
13 changes: 7 additions & 6 deletions youtube_transcript_api/_api.py
@@ -16,7 +16,7 @@

class YouTubeTranscriptApi(object):
@classmethod
def list_transcripts(cls, video_id, proxies=None, cookies=None):
def list_transcripts(cls, video_id, proxies=None, cookies=None, include_chapter_tags=False):
"""
Retrieves the list of transcripts which are available for a given video. It returns a `TranscriptList` object
which is iterable and provides methods to filter the list of transcripts for specific languages. While iterating
@@ -68,11 +68,11 @@ def list_transcripts(cls, video_id, proxies=None, cookies=None):
if cookies:
http_client.cookies = cls._load_cookies(cookies, video_id)
http_client.proxies = proxies if proxies else {}
return TranscriptListFetcher(http_client).fetch(video_id)
return TranscriptListFetcher(http_client).fetch(video_id, include_chapter_tags)

@classmethod
def get_transcripts(cls, video_ids, languages=('en',), continue_after_error=False, proxies=None,
cookies=None, preserve_formatting=False):
cookies=None, preserve_formatting=False, include_chapter_tags=False):
"""
Retrieves the transcripts for a list of videos.

@@ -102,7 +102,7 @@ def get_transcripts(cls, video_ids, languages=('en',), continue_after_error=Fals

for video_id in video_ids:
try:
data[video_id] = cls.get_transcript(video_id, languages, proxies, cookies, preserve_formatting)
data[video_id] = cls.get_transcript(video_id, languages, proxies, cookies, preserve_formatting, include_chapter_tags)
except Exception as exception:
if not continue_after_error:
raise exception
@@ -112,7 +112,8 @@ def get_transcripts(cls, video_ids, languages=('en',), continue_after_error=Fals
return data, unretrievable_videos

@classmethod
def get_transcript(cls, video_id, languages=('en',), proxies=None, cookies=None, preserve_formatting=False):
def get_transcript(cls, video_id, languages=('en',), proxies=None, cookies=None, preserve_formatting=False,
include_chapter_tags=False):
"""
Retrieves the transcript for a single video. This is just a shortcut for calling::

@@ -134,7 +135,7 @@ def get_transcript(cls, video_id, languages=('en',), proxies=None, cookies=None,
:rtype [{'text': str, 'start': float, 'end': float}]:
"""
assert isinstance(video_id, str), "`video_id` must be a string"
return cls.list_transcripts(video_id, proxies, cookies).find_transcript(languages).fetch(preserve_formatting=preserve_formatting)
return cls.list_transcripts(video_id, proxies, cookies, include_chapter_tags).find_transcript(languages).fetch(preserve_formatting=preserve_formatting)

@classmethod
def _load_cookies(cls, cookies, video_id):
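
The same flag is also threaded through the lower-level `list_transcripts` path shown above. A sketch under the same assumptions (placeholder video ID, English transcript available):

```python
from youtube_transcript_api import YouTubeTranscriptApi

# Placeholder video ID; the flag defaults to False, so existing callers are unaffected.
transcript_list = YouTubeTranscriptApi.list_transcripts('VIDEO_ID', include_chapter_tags=True)
transcript = transcript_list.find_transcript(['en'])

# With this PR, the extracted chapter list (or None if none was found) is kept on the
# Transcript object and merged into the segments when fetch() parses them.
print(transcript.chapters)
segments = transcript.fetch()
```
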
3 changes: 3 additions & 0 deletions youtube_transcript_api/_errors.py
@@ -76,6 +76,9 @@ class TooManyRequests(CouldNotRetrieveTranscript):
class TranscriptsDisabled(CouldNotRetrieveTranscript):
CAUSE_MESSAGE = 'Subtitles are disabled for this video'

class ChaptersNotAvailable(CouldNotRetrieveTranscript):
CAUSE_MESSAGE = 'No chapter info available for this video'


class NoTranscriptAvailable(CouldNotRetrieveTranscript):
CAUSE_MESSAGE = 'No transcripts are available for this video'
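
To make the `_transcripts.py` changes below easier to follow, this is roughly the shape `_extract_chapters_json` returns; the titles and millisecond values here are made up for illustration:

```python
# Illustrative only; real values come from the "chapters" blob in the watch-page HTML.
chapters = [
    {'title': 'Intro', 'time_range_start_ms': 0, 'next_time_range_start_ms': 90000},
    {'title': 'Demo', 'time_range_start_ms': 90000, 'next_time_range_start_ms': 240000},
    # The last chapter has no successor, so its upper bound is left open-ended.
    {'title': 'Outro', 'time_range_start_ms': 240000, 'next_time_range_start_ms': float('inf')},
]
```
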
67 changes: 61 additions & 6 deletions youtube_transcript_api/_transcripts.py
@@ -25,6 +25,7 @@
NoTranscriptAvailable,
FailedToCreateConsentCookie,
InvalidVideoId,
ChaptersNotAvailable
)
from ._settings import WATCH_URL

@@ -41,12 +42,50 @@ class TranscriptListFetcher(object):
def __init__(self, http_client):
self._http_client = http_client

def fetch(self, video_id):
def fetch(self, video_id, include_chapter_tags=False):
video_html = self._fetch_video_html(video_id)
chapters = None
if include_chapter_tags:
try:
chapters = self._extract_chapters_json(video_html, video_id)
except Exception:
pass  # chapter extraction is best-effort; fall back to no chapter info

return TranscriptList.build(
self._http_client,
video_id,
self._extract_captions_json(self._fetch_video_html(video_id), video_id),
self._extract_captions_json(video_html, video_id),
chapters
)

def _extract_chapters_json(self, html, video_id):
splitted_html = html.split('"chapters":')

if len(splitted_html) <= 1:
if video_id.startswith('http://') or video_id.startswith('https://'):
raise InvalidVideoId(video_id)
if 'class="g-recaptcha"' in html:
raise TooManyRequests(video_id)
if '"playabilityStatus":' not in html:
raise VideoUnavailable(video_id)

raise ChaptersNotAvailable(video_id)

chapters_json = json.loads(
splitted_html[1].split(',"trackingParams":')[0].replace('\n', '')
)
if chapters_json is None:
raise ChaptersNotAvailable(video_id)

chapters_json = [
{
'title': chapter['chapterRenderer']['title']['simpleText'],
'time_range_start_ms': chapter['chapterRenderer']['timeRangeStartMillis'],
'next_time_range_start_ms': chapters_json[i+1]['chapterRenderer']['timeRangeStartMillis'] if i+1 < len(chapters_json) else float('inf'),
} for i, chapter in enumerate(chapters_json)
]

return chapters_json

def _extract_captions_json(self, html, video_id):
splitted_html = html.split('"captions":')
@@ -117,7 +156,7 @@ def __init__(self, video_id, manually_created_transcripts, generated_transcripts
self._translation_languages = translation_languages

@staticmethod
def build(http_client, video_id, captions_json):
def build(http_client, video_id, captions_json, chapters_json=None):
"""
Factory method for TranscriptList.

@@ -154,6 +193,7 @@ def build(http_client, video_id, captions_json):
caption['languageCode'],
caption.get('kind', '') == 'asr',
translation_languages if caption.get('isTranslatable', False) else [],
chapters_json
)

return TranscriptList(
@@ -253,7 +293,7 @@ def _get_language_description(self, transcript_strings):


class Transcript(object):
def __init__(self, http_client, video_id, url, language, language_code, is_generated, translation_languages):
def __init__(self, http_client, video_id, url, language, language_code, is_generated, translation_languages, chapters=None):
"""
You probably don't want to initialize this directly. Usually you'll access Transcript objects using a
TranscriptList.
@@ -267,6 +307,7 @@ def __init__(self, http_client, video_id, url, language, language_code, is_gener
:param language_code:
:param is_generated:
:param translation_languages:
:param chapters:
"""
self._http_client = http_client
self.video_id = video_id
@@ -279,6 +320,7 @@ def __init__(self, http_client, video_id, url, language, language_code, is_gener
translation_language['language_code']: translation_language['language']
for translation_language in translation_languages
}
self.chapters = chapters

def fetch(self, preserve_formatting=False):
"""
@@ -291,6 +333,7 @@ def fetch(self, preserve_formatting=False):
response = self._http_client.get(self._url, headers={'Accept-Language': 'en-US'})
return _TranscriptParser(preserve_formatting=preserve_formatting).parse(
_raise_http_errors(response, self.video_id).text,
self.chapters
)

def __str__(self):
@@ -348,8 +391,8 @@ def _get_html_regex(self, preserve_formatting):
html_regex = re.compile(r'<[^>]*>', re.IGNORECASE)
return html_regex

def parse(self, plain_data):
return [
def parse(self, plain_data, chapters=None):
data = [
{
'text': re.sub(self._html_regex, '', unescape(xml_element.text)),
'start': float(xml_element.attrib['start']),
@@ -358,3 +401,15 @@ def parse(self, plain_data):
for xml_element in ElementTree.fromstring(plain_data)
if xml_element.text is not None
]
if chapters:
for d in data:
start_ms = d['start'] * 1000
end_ms = start_ms + d['duration'] * 1000
d['chapters'] = [
c['title'] for c in chapters
if c['time_range_start_ms'] <= start_ms <= c['next_time_range_start_ms']
or c['time_range_start_ms'] <= end_ms <= c['next_time_range_start_ms']
]

return data
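
Putting it together, a segment that straddles a chapter boundary is tagged with both chapters. A self-contained illustration of the matching rule used in `_TranscriptParser.parse`, with a made-up chapter list and segment:

```python
# Made-up data; mirrors the chapter-matching rule added to _TranscriptParser.parse above.
chapters = [
    {'title': 'Intro', 'time_range_start_ms': 0, 'next_time_range_start_ms': 90000},
    {'title': 'Demo', 'time_range_start_ms': 90000, 'next_time_range_start_ms': float('inf')},
]
segment = {'text': 'and now for the demo', 'start': 89.0, 'duration': 4.0}

start_ms = segment['start'] * 1000
end_ms = start_ms + segment['duration'] * 1000
segment['chapters'] = [
    c['title'] for c in chapters
    if c['time_range_start_ms'] <= start_ms <= c['next_time_range_start_ms']
    or c['time_range_start_ms'] <= end_ms <= c['next_time_range_start_ms']
]

print(segment['chapters'])  # ['Intro', 'Demo'] -- the segment spans the 90s boundary
```
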