From b6489e718cc5f640c71213a6ff5e886faa582af8 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Chris=20Ch=C3=A1vez?= <xnpiochv@gmail.com>
Date: Fri, 21 Feb 2025 13:33:13 -0500
Subject: [PATCH 1/3] feat: Video editor supports transcripts [FC-0076]
 (#36058)

* Add error handler on save video to avoid creating sjson
* Support transcripts without edx_video_id in definition_to_xml
* When copying a video from a library to a course: Create a new edx_video_id
* Save transcripts as static assets in a video in a library when adding a new transcript.
* Delete transcripts as static assets in a video in a library when deleting transcripts.
* Support download transcript in a video in a library.
* Support replace transcript in a video in a library.
* Support updating transcripts in video in a library.
* Refactor the code of downloading YouTube transcripts to enable this feature in libraries.
* Support copy from a library to a course and a course to a library.
---
 cms/djangoapps/contentstore/helpers.py        |  69 ++++-
 .../rest_api/v2/views/downstreams.py          |   4 +
 .../v2/views/tests/test_downstreams.py        |   4 +-
 .../views/tests/test_transcripts.py           |  40 ++-
 .../contentstore/views/transcripts_ajax.py    | 242 ++++++++++++----
 cms/lib/xblock/test/test_upstream_sync.py     |  28 ++
 cms/lib/xblock/upstream_sync.py               |   4 +
 .../core/djangoapps/xblock/rest_api/views.py  |   2 +-
 xmodule/tests/test_video.py                   |  42 +++
 xmodule/video_block/transcripts_utils.py      |  41 ++-
 xmodule/video_block/video_block.py            |  10 +-
 xmodule/video_block/video_handlers.py         | 262 +++++++++++-------
 12 files changed, 569 insertions(+), 179 deletions(-)

diff --git a/cms/djangoapps/contentstore/helpers.py b/cms/djangoapps/contentstore/helpers.py
index 1ceb2bc3c01d..59bd6b56277d 100644
--- a/cms/djangoapps/contentstore/helpers.py
+++ b/cms/djangoapps/contentstore/helpers.py
@@ -10,6 +10,7 @@
 import re
 
 from attrs import frozen, Factory
+from django.core.files.base import ContentFile
 from django.conf import settings
 from django.contrib.auth import get_user_model
 from django.utils.translation import gettext as _
@@ -23,6 +24,11 @@
 from xmodule.exceptions import NotFoundError
 from xmodule.modulestore.django import modulestore
 from xmodule.xml_block import XmlMixin
+from xmodule.video_block.transcripts_utils import Transcript, build_components_import_path
+from edxval.api import (
+    create_external_video,
+    create_or_update_video_transcript,
+)
 
 from cms.djangoapps.models.settings.course_grading import CourseGradingModel
 from cms.lib.xblock.upstream_sync import UpstreamLink, UpstreamLinkException, fetch_customizable_fields
@@ -274,8 +280,14 @@ def _insert_static_files_into_downstream_xblock(
         course_key=downstream_xblock.context_key,
         staged_content_id=staged_content_id,
         static_files=static_files,
-        usage_key=downstream_xblock.scope_ids.usage_id,
+        usage_key=downstream_xblock.usage_key,
     )
+    if downstream_xblock.usage_key.block_type == 'video':
+        _import_transcripts(
+            downstream_xblock,
+            staged_content_id=staged_content_id,
+            static_files=static_files,
+        )
 
     # Rewrite the OLX's static asset references to point to the new
     # locations for those assets. See _import_files_into_course for more
@@ -331,6 +343,13 @@ def import_staged_content_from_user_clipboard(parent_key: UsageKey, request) ->
             tags=user_clipboard.content.tags,
         )
 
+        usage_key = new_xblock.usage_key
+        if usage_key.block_type == 'video':
+            # The edx_video_id must always be new so as not
+            # to interfere with the data of the copied block
+            new_xblock.edx_video_id = create_external_video(display_name='external video')
+            store.update_item(new_xblock, request.user.id)
+
         notices = _insert_static_files_into_downstream_xblock(new_xblock, user_clipboard.content.id, request)
 
     return new_xblock, notices
@@ -630,8 +649,8 @@ def _import_file_into_course(
     # we're not going to attempt to change.
     if clipboard_file_path.startswith('static/'):
         # If it's in this form, it came from a library and assumes component-local assets
-        file_path = clipboard_file_path.lstrip('static/')
-        import_path = f"components/{usage_key.block_type}/{usage_key.block_id}/{file_path}"
+        file_path = clipboard_file_path.removeprefix('static/')
+        import_path = build_components_import_path(usage_key, file_path)
         filename = pathlib.Path(file_path).name
         new_key = course_key.make_asset_key("asset", import_path.replace("/", "_"))
     else:
@@ -672,6 +691,50 @@ def _import_file_into_course(
         return False, {}
 
 
+def _import_transcripts(
+    block: XBlock,
+    staged_content_id: int,
+    static_files: list[content_staging_api.StagedContentFileData],
+):
+    """
+    Adds transcripts to VAL using the new edx_video_id.
+    """
+    for file_data_obj in static_files:
+        clipboard_file_path = file_data_obj.filename
+        data = content_staging_api.get_staged_content_static_file_data(
+            staged_content_id,
+            clipboard_file_path
+        )
+        if data is None:
+            raise NotFoundError(file_data_obj.source_key)
+
+        if clipboard_file_path.startswith('static/'):
+            # If it's in this form, it came from a library and assumes component-local assets
+            file_path = clipboard_file_path.removeprefix('static/')
+        else:
+            # Otherwise it came from a course...
+            file_path = clipboard_file_path
+
+        filename = pathlib.Path(file_path).name
+
+        language_code = next((k for k, v in block.transcripts.items() if v == filename), None)
+        if language_code:
+            sjson_subs = Transcript.convert(
+                content=data,
+                input_format=Transcript.SRT,
+                output_format=Transcript.SJSON
+            ).encode()
+            create_or_update_video_transcript(
+                video_id=block.edx_video_id,
+                language_code=language_code,
+                metadata={
+                    'file_format': Transcript.SJSON,
+                    'language_code': language_code
+                },
+                file_data=ContentFile(sjson_subs),
+            )
+
+
 def is_item_in_course_tree(item):
     """
     Check that the item is in the course tree.
diff --git a/cms/djangoapps/contentstore/rest_api/v2/views/downstreams.py b/cms/djangoapps/contentstore/rest_api/v2/views/downstreams.py
index 94931acc8ff2..459070de3c63 100644
--- a/cms/djangoapps/contentstore/rest_api/v2/views/downstreams.py
+++ b/cms/djangoapps/contentstore/rest_api/v2/views/downstreams.py
@@ -88,6 +88,7 @@
     DeveloperErrorViewMixin,
     view_auth_classes,
 )
+from xmodule.video_block.transcripts_utils import clear_transcripts
 from xmodule.modulestore.django import modulestore
 from xmodule.modulestore.exceptions import ItemNotFoundError
 
@@ -224,6 +225,9 @@ def post(self, request: _AuthenticatedRequest, usage_key_string: str) -> Respons
         """
         downstream = _load_accessible_block(request.user, usage_key_string, require_write_access=True)
         try:
+            if downstream.usage_key.block_type == "video":
+                # Delete all transcripts so we can copy new ones from upstream
+                clear_transcripts(downstream)
             upstream = sync_from_upstream(downstream, request.user)
             static_file_notices = import_static_assets_for_library_sync(downstream, upstream, request)
         except UpstreamLinkException as exc:
diff --git a/cms/djangoapps/contentstore/rest_api/v2/views/tests/test_downstreams.py b/cms/djangoapps/contentstore/rest_api/v2/views/tests/test_downstreams.py
index ec7c842d1637..30d90457afec 100644
--- a/cms/djangoapps/contentstore/rest_api/v2/views/tests/test_downstreams.py
+++ b/cms/djangoapps/contentstore/rest_api/v2/views/tests/test_downstreams.py
@@ -264,7 +264,8 @@ def call_api(self, usage_key_string):
     @patch.object(UpstreamLink, "get_for_block", _get_upstream_link_good_and_syncable)
     @patch.object(downstreams_views, "sync_from_upstream")
     @patch.object(downstreams_views, "import_static_assets_for_library_sync", return_value=StaticFileNotices())
-    def test_200(self, mock_sync_from_upstream, mock_import_staged_content):
+    @patch.object(downstreams_views, "clear_transcripts")
+    def test_200(self, mock_sync_from_upstream, mock_import_staged_content, mock_clear_transcripts):
         """
         Does the happy path work?
         """
@@ -273,6 +274,7 @@ def test_200(self, mock_sync_from_upstream, mock_import_staged_content):
         assert response.status_code == 200
         assert mock_sync_from_upstream.call_count == 1
         assert mock_import_staged_content.call_count == 1
+        assert mock_clear_transcripts.call_count == 1
 
 
 class DeleteDownstreamSyncViewtest(_DownstreamSyncViewTestMixin, SharedModuleStoreTestCase):
diff --git a/cms/djangoapps/contentstore/views/tests/test_transcripts.py b/cms/djangoapps/contentstore/views/tests/test_transcripts.py
index 95fbaccbab7c..61c0bd81b9c8 100644
--- a/cms/djangoapps/contentstore/views/tests/test_transcripts.py
+++ b/cms/djangoapps/contentstore/views/tests/test_transcripts.py
@@ -15,9 +15,11 @@
 from django.urls import reverse
 from edxval.api import create_video
 from opaque_keys.edx.keys import UsageKey
+from organizations.tests.factories import OrganizationFactory
 
 from cms.djangoapps.contentstore.tests.utils import CourseTestCase, setup_caption_responses
 from openedx.core.djangoapps.contentserver.caching import del_cached_content
+from openedx.core.djangoapps.content_libraries import api as lib_api
 from xmodule.contentstore.content import StaticContent  # lint-amnesty, pylint: disable=wrong-import-order
 from xmodule.contentstore.django import contentstore  # lint-amnesty, pylint: disable=wrong-import-order
 from xmodule.exceptions import NotFoundError  # lint-amnesty, pylint: disable=wrong-import-order
@@ -27,8 +29,10 @@
     GetTranscriptsFromYouTubeException,
     Transcript,
     get_video_transcript_content,
-    remove_subs_from_store
+    get_transcript,
+    remove_subs_from_store,
 )
+from openedx.core.djangoapps.xblock import api as xblock_api
 
 TEST_DATA_CONTENTSTORE = copy.deepcopy(settings.CONTENTSTORE)
 TEST_DATA_CONTENTSTORE['DOC_STORE_CONFIG']['db'] = 'test_xcontent_%s' % uuid4().hex
@@ -92,6 +96,21 @@ def setUp(self):
         resp = self.client.ajax_post('/xblock/', data)
         self.assertEqual(resp.status_code, 200)
 
+        self.library = lib_api.create_library(
+            org=OrganizationFactory.create(short_name="org1"),
+            slug="lib",
+            title="Library",
+        )
+        self.library_block_metadata = lib_api.create_library_block(
+            self.library.key,
+            "video",
+            "video-transcript",
+        )
+        self.library_block = xblock_api.load_block(
+            self.library_block_metadata.usage_key,
+            self.user,
+        )
+
         self.video_usage_key = self._get_usage_key(resp)
         self.item = modulestore().get_item(self.video_usage_key)
         # hI10vDNYz4M - valid Youtube ID with transcripts.
@@ -702,6 +721,25 @@ def test_replace_transcript_success(self, edx_video_id):
         expected_sjson_content = json.loads(SJSON_TRANSCRIPT_CONTENT)
         self.assertDictEqual(actual_sjson_content, expected_sjson_content)
 
+    def test_replace_transcript_library_content_success(self):
+        # Make call to replace transcripts from youtube
+        response = self.replace_transcript(self.library_block_metadata.usage_key, self.youtube_id)
+
+        # Verify the response
+        self.assert_response(response, expected_status_code=200, expected_message='Success')
+
+        # Obtain updated block
+        updated_block = xblock_api.load_block(
+            self.library_block_metadata.usage_key,
+            self.user,
+        )
+
+        # Verify transcript content
+        transcript = get_transcript(updated_block, 'en', Transcript.SJSON)
+        actual_sjson_content = json.loads(transcript[0])
+        expected_sjson_content = json.loads(SJSON_TRANSCRIPT_CONTENT)
+        self.assertDictEqual(actual_sjson_content, expected_sjson_content)
+
     def test_replace_transcript_fails_without_data(self):
         """
         Verify that replace transcript fails if we do not provide video data in request.
diff --git a/cms/djangoapps/contentstore/views/transcripts_ajax.py b/cms/djangoapps/contentstore/views/transcripts_ajax.py
index 8cb7f455013b..9b667d9d67fa 100644
--- a/cms/djangoapps/contentstore/views/transcripts_ajax.py
+++ b/cms/djangoapps/contentstore/views/transcripts_ajax.py
@@ -19,7 +19,8 @@
 from django.utils.translation import gettext as _
 from edxval.api import create_external_video, create_or_update_video_transcript
 from opaque_keys import InvalidKeyError
-from opaque_keys.edx.keys import UsageKey
+from opaque_keys.edx.keys import UsageKey, UsageKeyV2
+from opaque_keys.edx.locator import LibraryLocatorV2
 
 from cms.djangoapps.contentstore.video_storage_handlers import TranscriptProvider
 from common.djangoapps.student.auth import has_course_author_access
@@ -43,6 +44,9 @@
     get_transcript_link_from_youtube,
     get_transcript_links_from_youtube,
 )
+from openedx.core.djangoapps.content_libraries import api as lib_api
+from openedx.core.djangoapps.xblock import api as xblock_api
+from openedx.core.djangoapps.xblock.data import CheckPerm
 
 __all__ = [
     'upload_transcripts',
@@ -87,6 +91,49 @@ def link_video_to_component(video_component, user):
     return edx_video_id
 
 
+def save_video_transcript_in_learning_core(
+    usage_key,
+    input_format,
+    transcript_content,
+    language_code
+):
+    """
+    Saves a video transcript to the learning core.
+
+    Learning Core uses the standard `.srt` format for subtitles.
+    Note: SJSON is an edx-specific format that we're trying to move away from,
+    so for all new stuff related to Learning Core should only use `.srt`.
+
+    Arguments:
+        usage_key: UsageKey of the block
+        input_format: Input transcript format for content being passed.
+        transcript_content: Content of the transcript file
+        language_code: transcript language code
+
+    Returns:
+        result: A boolean indicating whether the transcript was saved or not.
+        video_key: Key used in video filename
+    """
+    try:
+        srt_content = Transcript.convert(
+            content=transcript_content,
+            input_format=input_format,
+            output_format=Transcript.SRT
+        ).encode()
+
+        filename = f"static/transcript-{language_code}.srt"
+        lib_api.add_library_block_static_asset_file(
+            usage_key,
+            filename,
+            srt_content,
+        )
+        result = True
+    except (TranscriptsGenerationException, UnicodeDecodeError):
+        result = False
+
+    return result
+
+
 def save_video_transcript(edx_video_id, input_format, transcript_content, language_code):
     """
     Saves a video transcript to the VAL and its content to the configured django storage(DS).
@@ -118,6 +165,7 @@ def save_video_transcript(edx_video_id, input_format, transcript_content, langua
             },
             file_data=ContentFile(sjson_subs),
         )
+
         result = True
     except (TranscriptsGenerationException, UnicodeDecodeError):
         result = False
@@ -145,6 +193,7 @@ def validate_video_block(request, locator):
         item = _get_item(request, {'locator': locator})
         if item.category != 'video':
             error = _('Transcripts are supported only for "video" blocks.')
+
     except (InvalidKeyError, ItemNotFoundError):
         error = _('Cannot find item by locator.')
 
@@ -319,61 +368,38 @@ def check_transcripts(request):  # lint-amnesty, pylint: disable=too-many-statem
         get_transcript_from_val(edx_video_id=edx_video_id, lang='en')
         command = 'found'
     except NotFoundError:
-        filename = f'subs_{item.sub}.srt.sjson'
-        content_location = StaticContent.compute_location(item.location.course_key, filename)
-        try:
-            local_transcripts = contentstore().find(content_location).data.decode('utf-8')
-            transcripts_presence['current_item_subs'] = item.sub
-        except NotFoundError:
-            pass
-
         # Check for youtube transcripts presence
         youtube_id = videos.get('youtube', None)
         if youtube_id:
-            transcripts_presence['is_youtube_mode'] = True
+            _check_youtube_transcripts(
+                transcripts_presence,
+                youtube_id,
+                item,
+            )
 
-            # youtube local
-            filename = f'subs_{youtube_id}.srt.sjson'
+        if not isinstance(item.usage_key, UsageKeyV2):
+            filename = f'subs_{item.sub}.srt.sjson'
             content_location = StaticContent.compute_location(item.location.course_key, filename)
             try:
-                local_transcripts = contentstore().find(content_location).data.decode('utf-8')
-                transcripts_presence['youtube_local'] = True
+                contentstore().find(content_location).data.decode('utf-8')
+                transcripts_presence['current_item_subs'] = item.sub
             except NotFoundError:
-                log.debug("Can't find transcripts in storage for youtube id: %s", youtube_id)
+                pass
 
-            if get_transcript_link_from_youtube(youtube_id):
-                transcripts_presence['youtube_server'] = True
-            #check youtube local and server transcripts for equality
-            if transcripts_presence['youtube_server'] and transcripts_presence['youtube_local']:
+            # Check for html5 local transcripts presence
+            html5_subs = []
+            for html5_id in videos['html5']:
+                filename = f'subs_{html5_id}.srt.sjson'
+                content_location = StaticContent.compute_location(item.location.course_key, filename)
                 try:
-                    transcript_links = get_transcript_links_from_youtube(
-                        youtube_id,
-                        settings,
-                        item.runtime.service(item, "i18n")
+                    html5_subs.append(contentstore().find(content_location).data)
+                    transcripts_presence['html5_local'].append(html5_id)
+                except NotFoundError:
+                    log.debug("Can't find transcripts in storage for non-youtube video_id: %s", html5_id)
+                if len(html5_subs) == 2:  # check html5 transcripts for equality
+                    transcripts_presence['html5_equal'] = (
+                        json.loads(html5_subs[0].decode('utf-8')) == json.loads(html5_subs[1].decode('utf-8'))
                     )
-                    for (_, link) in transcript_links.items():
-                        youtube_server_subs = get_transcript_from_youtube(
-                            link, youtube_id, item.runtime.service(item, "i18n")
-                        )
-                        if json.loads(local_transcripts) == youtube_server_subs:  # check transcripts for equality
-                            transcripts_presence['youtube_diff'] = False
-                except GetTranscriptsFromYouTubeException:
-                    pass
-
-        # Check for html5 local transcripts presence
-        html5_subs = []
-        for html5_id in videos['html5']:
-            filename = f'subs_{html5_id}.srt.sjson'
-            content_location = StaticContent.compute_location(item.location.course_key, filename)
-            try:
-                html5_subs.append(contentstore().find(content_location).data)
-                transcripts_presence['html5_local'].append(html5_id)
-            except NotFoundError:
-                log.debug("Can't find transcripts in storage for non-youtube video_id: %s", html5_id)
-            if len(html5_subs) == 2:  # check html5 transcripts for equality
-                transcripts_presence['html5_equal'] = (
-                    json.loads(html5_subs[0].decode('utf-8')) == json.loads(html5_subs[1].decode('utf-8'))
-                )
 
         command, __ = _transcripts_logic(transcripts_presence, videos)
 
@@ -381,6 +407,43 @@ def check_transcripts(request):  # lint-amnesty, pylint: disable=too-many-statem
     return JsonResponse(transcripts_presence)
 
 
+def _check_youtube_transcripts(transcripts_presence, youtube_id, item):
+    """
+    Check for youtube transcripts presence
+    """
+    transcripts_presence['is_youtube_mode'] = True
+
+    if get_transcript_link_from_youtube(youtube_id):
+        transcripts_presence['youtube_server'] = True
+
+    if not isinstance(item.usage_key, UsageKeyV2):
+        # youtube local
+        filename = f'subs_{youtube_id}.srt.sjson'
+        content_location = StaticContent.compute_location(item.location.course_key, filename)
+        try:
+            local_transcripts = contentstore().find(content_location).data.decode('utf-8')
+            transcripts_presence['youtube_local'] = True
+        except NotFoundError:
+            log.debug("Can't find transcripts in storage for youtube id: %s", youtube_id)
+
+    # check youtube local and server transcripts for equality
+    if transcripts_presence['youtube_server'] and transcripts_presence['youtube_local']:
+        try:
+            transcript_links = get_transcript_links_from_youtube(
+                youtube_id,
+                settings,
+                item.runtime.service(item, "i18n")
+            )
+            for (_, link) in transcript_links.items():
+                youtube_server_subs = get_transcript_from_youtube(
+                    link, youtube_id, item.runtime.service(item, "i18n")
+                )
+                if json.loads(local_transcripts) == youtube_server_subs:  # check transcripts for equality
+                    transcripts_presence['youtube_diff'] = False
+        except GetTranscriptsFromYouTubeException:
+            pass
+
+
 def _transcripts_logic(transcripts_presence, videos):
     """
     By `transcripts_presence` content, figure what show to user:
@@ -447,7 +510,7 @@ def _validate_transcripts_data(request):
 
         data: dict, loaded json from request,
         videos: parsed `data` to useful format,
-        item:  video item from storage
+        item:  video item from storage or library
 
     Raises `TranscriptsRequestValidationException` if validation is unsuccessful
     or `PermissionDenied` if user has no access.
@@ -529,6 +592,7 @@ def choose_transcripts(request):
         Or error in case of validation failures.
     """
     error, validated_data = validate_transcripts_request(request, include_html5=True)
+    edx_video_id = None
     if error:
         response = error_response({}, error)
     else:
@@ -546,10 +610,24 @@ def choose_transcripts(request):
             return error_response({}, _('No such transcript.'))
 
         # 2. Link a video to video component if its not already linked to one.
-        edx_video_id = link_video_to_component(video, request.user)
+        if not isinstance(video.usage_key.context_key, LibraryLocatorV2):
+            edx_video_id = link_video_to_component(video, request.user)
 
         # 3. Upload the retrieved transcript to DS for the linked video ID.
-        success = save_video_transcript(edx_video_id, input_format, transcript_content, language_code='en')
+        if isinstance(video.usage_key.context_key, LibraryLocatorV2):
+            success = save_video_transcript_in_learning_core(
+                video.usage_key,
+                input_format,
+                transcript_content,
+                language_code='en',
+            )
+        else:
+            success = save_video_transcript(
+                edx_video_id,
+                input_format,
+                transcript_content,
+                language_code='en',
+            )
         if success:
             response = JsonResponse({'edx_video_id': edx_video_id, 'status': 'Success'}, status=200)
         else:
@@ -569,6 +647,7 @@ def rename_transcripts(request):
         Or error in case of validation failures.
     """
     error, validated_data = validate_transcripts_request(request)
+    edx_video_id = None
     if error:
         response = error_response({}, error)
     else:
@@ -585,10 +664,24 @@ def rename_transcripts(request):
             return error_response({}, _('No such transcript.'))
 
         # 2. Link a video to video component if its not already linked to one.
-        edx_video_id = link_video_to_component(video, request.user)
+        if not isinstance(video.usage_key.context_key, LibraryLocatorV2):
+            edx_video_id = link_video_to_component(video, request.user)
 
         # 3. Upload the retrieved transcript to DS for the linked video ID.
-        success = save_video_transcript(edx_video_id, input_format, transcript_content, language_code='en')
+        if isinstance(video.usage_key.context_key, LibraryLocatorV2):
+            success = save_video_transcript_in_learning_core(
+                video.usage_key,
+                input_format,
+                transcript_content,
+                language_code='en',
+            )
+        else:
+            success = save_video_transcript(
+                edx_video_id,
+                input_format,
+                transcript_content,
+                language_code='en',
+            )
         if success:
             response = JsonResponse({'edx_video_id': edx_video_id, 'status': 'Success'}, status=200)
         else:
@@ -610,6 +703,7 @@ def replace_transcripts(request):
     """
     error, validated_data = validate_transcripts_request(request, include_yt=True)
     youtube_id = validated_data['youtube']
+    edx_video_id = None
     if error:
         response = error_response({}, error)
     elif not youtube_id:
@@ -623,16 +717,34 @@ def replace_transcripts(request):
             return error_response({}, str(e))
 
         # 2. Link a video to video component if its not already linked to one.
-        edx_video_id = link_video_to_component(video, request.user)
-
-        # for transcript in transcript_links:
+        if not isinstance(video.usage_key.context_key, LibraryLocatorV2):
+            edx_video_id = link_video_to_component(video, request.user)
 
         # 3. Upload YT transcript to DS for the linked video ID.
         success = True
         for transcript in transcript_content:
             [language_code, json_content] = transcript
-            success = save_video_transcript(edx_video_id, Transcript.SJSON, json_content, language_code)
+            if isinstance(video.usage_key.context_key, LibraryLocatorV2):
+                success = save_video_transcript_in_learning_core(
+                    video.usage_key,
+                    Transcript.SJSON,
+                    json_content,
+                    language_code,
+                )
+                filename = f"transcript-{language_code}.srt"
+            else:
+                success = save_video_transcript(
+                    edx_video_id,
+                    Transcript.SJSON,
+                    json_content,
+                    language_code,
+                )
+                filename = f"{edx_video_id}-{language_code}.srt"
+            if not success:
+                break
+            video.transcripts[language_code] = filename
         if success:
+            video.save()
             response = JsonResponse({'edx_video_id': edx_video_id, 'status': 'Success'}, status=200)
         else:
             response = error_response({}, _('There is a problem with the YouTube transcript file.'))
@@ -643,17 +755,25 @@ def replace_transcripts(request):
 def _get_item(request, data):
     """
     Obtains from 'data' the locator for an item.
-    Next, gets that item from the modulestore (allowing any errors to raise up).
+    Next, gets that item from the modulestore (allowing any errors to raise up)
+    or from library API if is a library content.
     Finally, verifies that the user has access to the item.
 
-    Returns the item.
+    Returns the item and a boolean if is a library content.
     """
     usage_key = UsageKey.from_string(data.get('locator'))
-    if not usage_key.context_key.is_course:
-        # TODO: implement transcript support for learning core / content libraries.
-        raise TranscriptsRequestValidationException(_('Transcripts are not yet supported in content libraries.'))
+
+    context_key = usage_key.context_key
+    if not context_key.is_course:
+        if isinstance(context_key, LibraryLocatorV2):
+            return xblock_api.load_block(
+                usage_key,
+                request.user,
+                check_permission=CheckPerm.CAN_EDIT,
+            )
+        raise TranscriptsRequestValidationException(_('Transcripts are not yet supported for this type of block'))
     # This is placed before has_course_author_access() to validate the location,
-    # because has_course_author_access() raises  r if location is invalid.
+    # because has_course_author_access() raises error if location is invalid.
     item = modulestore().get_item(usage_key)
 
     # use the item's course_key, because the usage_key might not have the run
diff --git a/cms/lib/xblock/test/test_upstream_sync.py b/cms/lib/xblock/test/test_upstream_sync.py
index 2f8f77ab6560..71fa7d51bb9d 100644
--- a/cms/lib/xblock/test/test_upstream_sync.py
+++ b/cms/lib/xblock/test/test_upstream_sync.py
@@ -71,6 +71,21 @@ def setUp(self):
             '/>\n'
         ))
 
+        self.upstream_video_key = libs.create_library_block(self.library.key, "video", "video-upstream").usage_key
+        libs.set_library_block_olx(self.upstream_video_key, (
+            '<video'
+            ' display_name="Video Test"'
+            ' edx_video_id=""'
+            ' end_time="00:00:00"'
+            ' html5_sources="[&quot;https://www.sample-videos.com/video321/mp4/720/big_buck_bunny_720p_2mb.mp4&quot;]"'
+            ' start_time="00:00:00"'
+            ' track=""'
+            ' youtube_id_1_0=""'
+            '>'
+            ' <source src="https://www.sample-videos.com/video321/mp4/720/big_buck_bunny_720p_2mb.mp4"/>'
+            '</video>'
+        ))
+
         libs.publish_changes(self.library.key, self.user.id)
 
         self.taxonomy_all_org = tagging_api.create_taxonomy(
@@ -539,3 +554,16 @@ def test_sync_library_block_tags(self):
         assert len(object_tags) == len(new_upstream_tags)
         for object_tag in object_tags:
             assert object_tag.value in new_upstream_tags
+
+    def test_sync_video_block(self):
+        downstream = BlockFactory.create(category='video', parent=self.unit, upstream=str(self.upstream_video_key))
+        downstream.edx_video_id = "test_video_id"
+
+        # Sync
+        sync_from_upstream(downstream, self.user)
+        assert downstream.upstream_version == 2
+        assert downstream.upstream_display_name == "Video Test"
+        assert downstream.display_name == "Video Test"
+
+        # `edx_video_id` doesn't change
+        assert downstream.edx_video_id == "test_video_id"
diff --git a/cms/lib/xblock/upstream_sync.py b/cms/lib/xblock/upstream_sync.py
index 8a12ea8fc045..22cad3c6d36a 100644
--- a/cms/lib/xblock/upstream_sync.py
+++ b/cms/lib/xblock/upstream_sync.py
@@ -297,7 +297,11 @@ def _update_non_customizable_fields(*, upstream: XBlock, downstream: XBlock) ->
     """
     syncable_fields = _get_synchronizable_fields(upstream, downstream)
     customizable_fields = set(downstream.get_customizable_fields().keys())
+    isVideoBlock = downstream.usage_key.block_type == "video"
     for field_name in syncable_fields - customizable_fields:
+        if isVideoBlock and field_name == 'edx_video_id':
+            # Avoid overwriting edx_video_id between blocks
+            continue
         new_upstream_value = getattr(upstream, field_name)
         setattr(downstream, field_name, new_upstream_value)
 
diff --git a/openedx/core/djangoapps/xblock/rest_api/views.py b/openedx/core/djangoapps/xblock/rest_api/views.py
index 0fd202b4eff2..273bc3ffec3e 100644
--- a/openedx/core/djangoapps/xblock/rest_api/views.py
+++ b/openedx/core/djangoapps/xblock/rest_api/views.py
@@ -178,7 +178,7 @@ def xblock_handler(
     """
     # To support sandboxed XBlocks, custom frontends, and other use cases, we
     # authenticate requests using a secure token in the URL. see
-    # openedx.core.djangoapps.xblock.utils.get_secure_hash_for_xblock_handler
+    # openedx.core.djangoapps.xblock.utils.get_secure_token_for_xblock_handler
     # for details and rationale.
     if not validate_secure_token_for_xblock_handler(user_id, str(usage_key), secure_token):
         raise PermissionDenied("Invalid/expired auth token.")
diff --git a/xmodule/tests/test_video.py b/xmodule/tests/test_video.py
index 5e95f77082b1..e98c78244db5 100644
--- a/xmodule/tests/test_video.py
+++ b/xmodule/tests/test_video.py
@@ -741,6 +741,48 @@ def test_export_to_xml(self, mock_val_api):
             course_id=self.block.scope_ids.usage_id.context_key,
         )
 
+    def test_export_to_xml_without_video_id(self):
+        """
+        Test that we write the correct XML on export of a video without edx_video_id.
+        """
+        self.block.youtube_id_0_75 = 'izygArpw-Qo'
+        self.block.youtube_id_1_0 = 'p2Q6BrNhdh8'
+        self.block.youtube_id_1_25 = '1EeWXzPdhSA'
+        self.block.youtube_id_1_5 = 'rABDYkeK0x8'
+        self.block.show_captions = False
+        self.block.start_time = datetime.timedelta(seconds=1.0)
+        self.block.end_time = datetime.timedelta(seconds=60)
+        self.block.track = 'http://www.example.com/track'
+        self.block.handout = 'http://www.example.com/handout'
+        self.block.download_track = True
+        self.block.html5_sources = ['http://www.example.com/source.mp4', 'http://www.example.com/source1.ogg']
+        self.block.download_video = True
+        self.block.transcripts = {'ua': 'ukrainian_translation.srt', 'ge': 'german_translation.srt'}
+
+        xml = self.block.definition_to_xml(self.file_system)
+        parser = etree.XMLParser(remove_blank_text=True)
+        xml_string = '''\
+         <video
+            url_name="SampleProblem"
+            start_time="0:00:01"
+            show_captions="false"
+            end_time="0:01:00"
+            download_video="true"
+            download_track="true"
+            youtube="0.75:izygArpw-Qo,1.00:p2Q6BrNhdh8,1.25:1EeWXzPdhSA,1.50:rABDYkeK0x8"
+            transcripts='{"ge": "german_translation.srt", "ua": "ukrainian_translation.srt"}'
+         >
+           <source src="http://www.example.com/source.mp4"/>
+           <source src="http://www.example.com/source1.ogg"/>
+           <track src="http://www.example.com/track"/>
+           <handout src="http://www.example.com/handout"/>
+           <transcript language="ge" src="german_translation.srt" />
+           <transcript language="ua" src="ukrainian_translation.srt" />
+         </video>
+        '''
+        expected = etree.XML(xml_string, parser=parser)
+        self.assertXmlEqual(expected, xml)
+
     @patch('xmodule.video_block.video_block.edxval_api')
     def test_export_to_xml_val_error(self, mock_val_api):
         # Export should succeed without VAL data if video does not exist
diff --git a/xmodule/video_block/transcripts_utils.py b/xmodule/video_block/transcripts_utils.py
index 866edf596812..c08f9e4696bc 100644
--- a/xmodule/video_block/transcripts_utils.py
+++ b/xmodule/video_block/transcripts_utils.py
@@ -20,6 +20,7 @@
 from opaque_keys.edx.keys import UsageKeyV2
 from pysrt import SubRipFile, SubRipItem, SubRipTime
 from pysrt.srtexc import Error
+from opaque_keys.edx.locator import LibraryLocatorV2
 
 from openedx.core.djangoapps.xblock.api import get_component_from_usage_key
 from xmodule.contentstore.content import StaticContent
@@ -498,16 +499,17 @@ def manage_video_subtitles_save(item, user, old_metadata=None, generate_translat
                     remove_subs_from_store(video_id, item, lang)
 
         reraised_message = ''
-        for lang in new_langs:  # 3b
-            try:
-                generate_sjson_for_all_speeds(
-                    item,
-                    item.transcripts[lang],
-                    {speed: subs_id for subs_id, speed in youtube_speed_dict(item).items()},
-                    lang,
-                )
-            except TranscriptException:
-                pass
+        if not isinstance(item.usage_key.context_key, LibraryLocatorV2):
+            for lang in new_langs:  # 3b
+                try:
+                    generate_sjson_for_all_speeds(
+                        item,
+                        item.transcripts[lang],
+                        {speed: subs_id for subs_id, speed in youtube_speed_dict(item).items()},
+                        lang,
+                    )
+                except TranscriptException:
+                    pass
         if reraised_message:
             item.save_with_metadata(user)
             raise TranscriptException(reraised_message)
@@ -684,6 +686,18 @@ def convert_video_transcript(file_name, content, output_format):
     return dict(filename=filename, content=converted_transcript)
 
 
+def clear_transcripts(block):
+    """
+    Deletes all transcripts of a video block from VAL
+    """
+    for language_code in block.transcripts.keys():
+        edxval_api.delete_video_transcript(
+            video_id=block.edx_video_id,
+            language_code=language_code,
+        )
+    block.transcripts = {}
+
+
 class Transcript:
     """
     Container for transcript methods.
@@ -1040,6 +1054,13 @@ def get_transcript_from_contentstore(video, language, output_format, transcripts
     return transcript_content, transcript_name, Transcript.mime_types[output_format]
 
 
+def build_components_import_path(usage_key, file_path):
+    """
+    Build components import path
+    """
+    return f"components/{usage_key.block_type}/{usage_key.block_id}/{file_path}"
+
+
 def get_transcript_from_learning_core(video_block, language, output_format, transcripts_info):
     """
     Get video transcript from Learning Core (used for Content Libraries)
diff --git a/xmodule/video_block/video_block.py b/xmodule/video_block/video_block.py
index 84d7edcf7263..30af789506ed 100644
--- a/xmodule/video_block/video_block.py
+++ b/xmodule/video_block/video_block.py
@@ -855,11 +855,15 @@ def definition_to_xml(self, resource_fs):  # lint-amnesty, pylint: disable=too-m
                 if new_transcripts.get('en'):
                     xml.set('sub', '')
 
-                # Update `transcripts` attribute in the xml
-                xml.set('transcripts', json.dumps(transcripts, sort_keys=True))
-
             except edxval_api.ValVideoNotFoundError:
                 pass
+        else:
+            if transcripts.get('en'):
+                xml.set('sub', '')
+
+        if transcripts:
+            # Update `transcripts` attribute in the xml
+            xml.set('transcripts', json.dumps(transcripts, sort_keys=True))
 
             # Sorting transcripts for easy testing of resulting xml
             for transcript_language in sorted(transcripts.keys()):
diff --git a/xmodule/video_block/video_handlers.py b/xmodule/video_block/video_handlers.py
index b7857e881ece..97f40ab2776d 100644
--- a/xmodule/video_block/video_handlers.py
+++ b/xmodule/video_block/video_handlers.py
@@ -13,13 +13,14 @@
 from django.core.files.base import ContentFile
 from django.utils.timezone import now
 from edxval.api import create_external_video, create_or_update_video_transcript, delete_video_transcript
-from opaque_keys.edx.locator import CourseLocator
+from opaque_keys.edx.locator import CourseLocator, LibraryLocatorV2
 from webob import Response
 from xblock.core import XBlock
 from xblock.exceptions import JsonHandlerError
 
 from xmodule.exceptions import NotFoundError
 from xmodule.fields import RelativeTime
+from openedx.core.djangoapps.content_libraries import api as lib_api
 
 from .transcripts_utils import (
     Transcript,
@@ -467,7 +468,6 @@ def validate_transcript_upload_data(self, data):
 
         return error
 
-    # pylint: disable=too-many-statements
     @XBlock.handler
     def studio_transcript(self, request, dispatch):
         """
@@ -495,118 +495,182 @@ def studio_transcript(self, request, dispatch):
                     no SRT extension or not parse-able by PySRT
                 UnicodeDecodeError: non-UTF8 uploaded file content encoding.
         """
-        _ = self.runtime.service(self, "i18n").ugettext
-
         if dispatch.startswith('translation'):
 
             if request.method == 'POST':
-                error = self.validate_transcript_upload_data(data=request.POST)
-                if error:
-                    response = Response(json={'error': error}, status=400)
-                else:
-                    edx_video_id = clean_video_id(request.POST['edx_video_id'])
-                    language_code = request.POST['language_code']
-                    new_language_code = request.POST['new_language_code']
-                    transcript_file = request.POST['file'].file
-
-                    if not edx_video_id:
-                        # Back-populate the video ID for an external video.
-                        # pylint: disable=attribute-defined-outside-init
-                        self.edx_video_id = edx_video_id = create_external_video(display_name='external video')
-
-                    try:
-                        # Convert SRT transcript into an SJSON format
-                        # and upload it to S3.
-                        sjson_subs = Transcript.convert(
-                            content=transcript_file.read().decode('utf-8'),
-                            input_format=Transcript.SRT,
-                            output_format=Transcript.SJSON
-                        ).encode()
-                        create_or_update_video_transcript(
-                            video_id=edx_video_id,
-                            language_code=language_code,
-                            metadata={
-                                'file_format': Transcript.SJSON,
-                                'language_code': new_language_code
-                            },
-                            file_data=ContentFile(sjson_subs),
-                        )
-                        payload = {
-                            'edx_video_id': edx_video_id,
-                            'language_code': new_language_code
-                        }
-                        # If a new transcript is added, then both new_language_code and
-                        # language_code fields will have the same value.
-                        if language_code != new_language_code:
-                            self.transcripts.pop(language_code, None)
-                        self.transcripts[new_language_code] = f'{edx_video_id}-{new_language_code}.srt'
-                        response = Response(json.dumps(payload), status=201)
-                    except (TranscriptsGenerationException, UnicodeDecodeError):
-                        response = Response(
-                            json={
-                                'error': _(
-                                    'There is a problem with this transcript file. Try to upload a different file.'
-                                )
-                            },
-                            status=400
-                        )
+                response = self._studio_transcript_upload(request)
             elif request.method == 'DELETE':
-                request_data = request.json
+                response = self._studio_transcript_delete(request)
+            elif request.method == 'GET':
+                response = self._studio_transcript_get(request)
+            else:
+                # Any other HTTP method is not allowed.
+                response = Response(status=404)
+
+        else:  # unknown dispatch
+            log.debug("Dispatch is not allowed")
+            response = Response(status=404)
 
-                if 'lang' not in request_data or 'edx_video_id' not in request_data:
-                    return Response(status=400)
+        return response
 
-                language = request_data['lang']
-                edx_video_id = clean_video_id(request_data['edx_video_id'])
+    def _save_transcript_field(self):
+        """
+        Save `transcripts` block field.
+        """
+        field = self.fields['transcripts']
+        if self.transcripts:
+            transcripts_copy = self.transcripts.copy()
+            # Need to delete to overwrite, it's weird behavior,
+            # but it only works like this.
+            field.delete_from(self)
+            field.write_to(self, transcripts_copy)
+        else:
+            field.delete_from(self)
 
-                if edx_video_id:
-                    delete_video_transcript(video_id=edx_video_id, language_code=language)
+    def _studio_transcript_upload(self, request):
+        """
+        Upload transcript. Used in "POST" method in `studio_transcript`
+        """
+        _ = self.runtime.service(self, "i18n").ugettext
+        error = self.validate_transcript_upload_data(data=request.POST)
+        if error:
+            response = Response(json={'error': error}, status=400)
+        else:
+            edx_video_id = clean_video_id(request.POST['edx_video_id'])
+            language_code = request.POST['language_code']
+            new_language_code = request.POST['new_language_code']
+            transcript_file = request.POST['file'].file
 
-                if language == 'en':
-                    # remove any transcript file from content store for the video ids
-                    possible_sub_ids = [
-                        self.sub,  # pylint: disable=access-member-before-definition
-                        self.youtube_id_1_0
-                    ] + get_html5_ids(self.html5_sources)
-                    for sub_id in possible_sub_ids:
-                        remove_subs_from_store(sub_id, self, language)
+            is_library = isinstance(self.usage_key.context_key, LibraryLocatorV2)
 
-                    # update metadata as `en` can also be present in `transcripts` field
-                    remove_subs_from_store(self.transcripts.pop(language, None), self, language)
+            if is_library:
+                filename = f'transcript-{new_language_code}.srt'
+            else:
+                if not edx_video_id:
+                    # Back-populate the video ID for an external video.
+                    # pylint: disable=attribute-defined-outside-init
+                    self.edx_video_id = edx_video_id = create_external_video(display_name='external video')
+                filename = f'{edx_video_id}-{new_language_code}.srt'
 
-                    # also empty `sub` field
-                    self.sub = ''  # pylint: disable=attribute-defined-outside-init
+            try:
+                # Convert SRT transcript into an SJSON format
+                # and upload it to S3.
+                content = transcript_file.read()
+                payload = {
+                    'edx_video_id': edx_video_id,
+                    'language_code': new_language_code
+                }
+                if is_library:
+                    # Save transcript as static asset in Learning Core if is a library component
+                    filename = f"static/{filename}"
+                    lib_api.add_library_block_static_asset_file(
+                        self.usage_key,
+                        filename,
+                        content,
+                    )
                 else:
-                    remove_subs_from_store(self.transcripts.pop(language, None), self, language)
-
-                return Response(status=200)
+                    sjson_subs = Transcript.convert(
+                        content=content.decode('utf-8'),
+                        input_format=Transcript.SRT,
+                        output_format=Transcript.SJSON
+                    ).encode()
+                    create_or_update_video_transcript(
+                        video_id=edx_video_id,
+                        language_code=language_code,
+                        metadata={
+                            'file_format': Transcript.SJSON,
+                            'language_code': new_language_code
+                        },
+                        file_data=ContentFile(sjson_subs),
+                    )
 
-            elif request.method == 'GET':
-                language = request.GET.get('language_code')
-                if not language:
-                    return Response(json={'error': _('Language is required.')}, status=400)
+                # If a new transcript is added, then both new_language_code and
+                # language_code fields will have the same value.
+                if language_code != new_language_code:
+                    self.transcripts.pop(language_code, None)
+                self.transcripts[new_language_code] = filename
 
-                try:
-                    transcript_content, transcript_name, mime_type = get_transcript(
-                        video=self, lang=language, output_format=Transcript.SRT
-                    )
-                    response = Response(transcript_content, headerlist=[
-                        (
-                            'Content-Disposition',
-                            f'attachment; filename="{transcript_name}"'
-                        ),
-                        ('Content-Language', language),
-                        ('Content-Type', mime_type)
-                    ])
-                except (UnicodeDecodeError, TranscriptsGenerationException, NotFoundError):
-                    response = Response(status=404)
+                if is_library:
+                    self._save_transcript_field()
+                response = Response(json.dumps(payload), status=201)
+            except (TranscriptsGenerationException, UnicodeDecodeError):
+                response = Response(
+                    json={
+                        'error': _(
+                            'There is a problem with this transcript file. Try to upload a different file.'
+                        )
+                    },
+                    status=400
+                )
+        return response
 
+    def _studio_transcript_delete(self, request):
+        """
+        Delete transcript. Used in "DELETE" method in `studio_transcript`
+        """
+        request_data = request.json
+
+        if 'lang' not in request_data or 'edx_video_id' not in request_data:
+            return Response(status=400)
+
+        language = request_data['lang']
+        edx_video_id = clean_video_id(request_data['edx_video_id'])
+
+        if edx_video_id:
+            delete_video_transcript(video_id=edx_video_id, language_code=language)
+
+        if isinstance(self.usage_key.context_key, LibraryLocatorV2):
+            transcript_name = self.transcripts.pop(language, None)
+            if transcript_name:
+                # TODO: In the future, we need a proper XBlock API
+                # like `self.static_assets.delete(...)` instead of coding
+                # these runtime-specific/library-specific APIs.
+                lib_api.delete_library_block_static_asset_file(
+                    self.usage_key,
+                    f"static/{transcript_name}",
+                )
+                self._save_transcript_field()
+        else:
+            if language == 'en':
+                # remove any transcript file from content store for the video ids
+                possible_sub_ids = [
+                    self.sub,  # pylint: disable=access-member-before-definition
+                    self.youtube_id_1_0
+                ] + get_html5_ids(self.html5_sources)
+                for sub_id in possible_sub_ids:
+                    remove_subs_from_store(sub_id, self, language)
+
+                # update metadata as `en` can also be present in `transcripts` field
+                remove_subs_from_store(self.transcripts.pop(language, None), self, language)
+
+                # also empty `sub` field
+                self.sub = ''  # pylint: disable=attribute-defined-outside-init
             else:
-                # Any other HTTP method is not allowed.
-                response = Response(status=404)
+                remove_subs_from_store(self.transcripts.pop(language, None), self, language)
 
-        else:  # unknown dispatch
-            log.debug("Dispatch is not allowed")
-            response = Response(status=404)
+        return Response(status=200)
 
+    def _studio_transcript_get(self, request):
+        """
+        Get transcript. Used in "GET" method in `studio_transcript`
+        """
+        _ = self.runtime.service(self, "i18n").ugettext
+        language = request.GET.get('language_code')
+        if not language:
+            return Response(json={'error': _('Language is required.')}, status=400)
+
+        try:
+            transcript_content, transcript_name, mime_type = get_transcript(
+                video=self, lang=language, output_format=Transcript.SRT
+            )
+            response = Response(transcript_content, headerlist=[
+                (
+                    'Content-Disposition',
+                    f'attachment; filename="{transcript_name}"'
+                ),
+                ('Content-Language', language),
+                ('Content-Type', mime_type)
+            ])
+        except (UnicodeDecodeError, TranscriptsGenerationException, NotFoundError):
+            response = Response(status=404)
         return response

From ecf5aee2974f4add888ebc70aad480413dd74dbb Mon Sep 17 00:00:00 2001
From: Deborah Kaplan <deborahgu@users.noreply.github.com>
Date: Fri, 21 Feb 2025 14:43:04 -0500
Subject: [PATCH 2/3] feat: management command to purge information about web
 certificates (#36287)

for Open edX  operators who still have users with legacy PDF certificates, retirement requires first extracting information from the user's GeneratedCertificate record in order to delete the 4 associated files for each PDF certificate, and then removing the links to the relevant files.  this creates a management command to do that work.

After thinking about it, I have removed the update to `status` from this management command, as per the original specification of the ticket. I added it for completeness originally, but was already uncomfortable, because it's not exactly accurate. The `CertificateStatuses` enum does define a `deleted` status:

```
    deleted             - The PDF certificate has been deleted.
```

but I think it's inappropriate to use here.

#### Why not use `CertificateStatuses.deleted` in the first place

There are multiple places in the code where it's clear that almost all of the statuses are legacy and unused (eg. [Example 1](https://github.com/openedx/edx-platform/blob/6c6fd84e533e6cb272da2e618f227225ad907232/lms/djangoapps/certificates/data.py#L12-L34), [Example 2](https://github.com/openedx/edx-platform/blob/1029de5537752ecf0477d578dba8a932e741e497/common/djangoapps/student/helpers.py#L491-L492)). There are innumerable APIs in the system that have expectations about what might possibly be returned from a `GeneratedCertificate.status` object, and none of them is expecting `deleted`

#### Why not revoke the certificate

Ultimately, the certificate isn't revoked, which has a specific meaning around saying it was unearned. The certificate was earned; it has simply been deleted.  We should not be kicking off program certificate invalidation, because that's not what's happening. We should be trusting the normal user retirement process to remove/purge PII from any program certificates that might exist. The nature of web certificates simply means that we are going through this process outside of the normal retirement flow. The normal retirement flow can be trusted to implement any certificate object revocation/removal/PII-purging, and doing an extra step outside of that flow is counterproductive.

#### Why not robustly add a flow for `CertificateStatuses.deleted`

When PDF certificates were removed from the system, they weren't removed in their entirety. Instead, we have this vestigial remains of PDF certificates code, just enough to allow learners to display and use the ones that they already have, without any of the support systems for modifying them. Adding a `deleted` status, verifying that all other APIs wouldn't break in the presence of a certificate with that status, adding the signals to process and propagate the change: all of this would be adding more tech debt upon the already existing technical debt which is the PDF certs code. Better to simply add this one necessary data integrity change, and focus on a process which might allow us to eventually remove the web certificates code.

#### Why it is good enough to ignore the status

The original ask was simply to enforce data integrity: to remove links to files that have been deleted, as an indication that they've been deleted. I only added `status` update out of a (misplaced but well-intentioned) completionist urge.

FIXES: APER-3889
---
 lms/djangoapps/certificates/admin.py          |   6 +
 .../purge_references_to_pdf_certificates.py   | 104 ++++++++++++++++++
 ...st_purge_references_to_pdf_certificates.py | 101 +++++++++++++++++
 ...8_pdf_certificate_purge_data_management.py |  29 +++++
 lms/djangoapps/certificates/models.py         |  24 ++++
 5 files changed, 264 insertions(+)
 create mode 100644 lms/djangoapps/certificates/management/commands/purge_references_to_pdf_certificates.py
 create mode 100644 lms/djangoapps/certificates/management/commands/tests/test_purge_references_to_pdf_certificates.py
 create mode 100644 lms/djangoapps/certificates/migrations/0038_pdf_certificate_purge_data_management.py

diff --git a/lms/djangoapps/certificates/admin.py b/lms/djangoapps/certificates/admin.py
index 3facf6f6b530..ffa225321201 100644
--- a/lms/djangoapps/certificates/admin.py
+++ b/lms/djangoapps/certificates/admin.py
@@ -22,6 +22,7 @@
     CertificateTemplateAsset,
     GeneratedCertificate,
     ModifiedCertificateTemplateCommandConfiguration,
+    PurgeReferencestoPDFCertificatesCommandConfiguration,
 )
 
 
@@ -103,6 +104,11 @@ class CertificateGenerationCommandConfigurationAdmin(ConfigurationModelAdmin):
     pass
 
 
+@admin.register(PurgeReferencestoPDFCertificatesCommandConfiguration)
+class PurgeReferencestoPDFCertificatesCommandConfigurationAdmin(ConfigurationModelAdmin):
+    pass
+
+
 class CertificateDateOverrideAdmin(admin.ModelAdmin):
     """
     # Django admin customizations for CertificateDateOverride model
diff --git a/lms/djangoapps/certificates/management/commands/purge_references_to_pdf_certificates.py b/lms/djangoapps/certificates/management/commands/purge_references_to_pdf_certificates.py
new file mode 100644
index 000000000000..384e58b30831
--- /dev/null
+++ b/lms/djangoapps/certificates/management/commands/purge_references_to_pdf_certificates.py
@@ -0,0 +1,104 @@
+"""
+A management command designed to be part of the retirement pipeline for any Open EdX operators
+with users who still have legacy PDF certificates.
+
+Once an external process has run to remove the four files comprising a legacy PDF certificate,
+this management command will remove the reference to the file from the certificate record.
+
+Note: it is important to retain the reference in the certificate table until
+the files have been deleted, because that reference is the files' identifying descriptor.
+"""
+
+import logging
+import shlex
+
+from django.contrib.auth import get_user_model
+from django.core.management.base import BaseCommand, CommandError
+
+from lms.djangoapps.certificates.models import (
+    GeneratedCertificate,
+    PurgeReferencestoPDFCertificatesCommandConfiguration,
+)
+
+User = get_user_model()
+log = logging.getLogger(__name__)
+
+
+class Command(BaseCommand):
+    """
+    Doesn't invoke the custom save() function defined as part of the `GeneratedCertificate`
+    model; perforce will emit no Django signals. This is desired behavior. We are
+    using this management command to purge information that was never sent to any
+    other systems, so we don't need to propagate updates.
+
+    Example usage:
+
+    # Dry Run (preview changes):
+    $ ./manage.py lms purge_references_to_pdf_certificates --dry-run
+
+    # Purge data:
+    $ ./manage.py lms purge_references_to_pdf_certificates
+    """
+
+    help = """Purges references to PDF certificates. Intended to be run after the files have been deleted."""
+
+    def add_arguments(self, parser):
+        parser.add_argument(
+            "--dry-run",
+            action="store_true",
+            help="Shows a preview of what users would be affected by running this management command.",
+        )
+        parser.add_argument(
+            "--certificate_ids",
+            nargs="+",
+            dest="certificate_ids",
+            help="space-separated list of GeneratedCertificate IDs to clean up",
+        )
+        parser.add_argument(
+            "--args-from-database",
+            action="store_true",
+            help=(
+                "Use arguments from the PurgeReferencesToPDFCertificatesCommandConfiguration "
+                "model instead of the command line"
+            ),
+        )
+
+    def get_args_from_database(self):
+        """
+        Returns an options dictionary from the current CertificateGenerationCommandConfiguration model.
+        """
+        config = PurgeReferencestoPDFCertificatesCommandConfiguration.current()
+        if not config.enabled:
+            raise CommandError(
+                "PurgeReferencestoPDFCertificatesCommandConfiguration is disabled, "
+                "but --args-from-database was requested"
+            )
+
+        args = shlex.split(config.arguments)
+        parser = self.create_parser("manage.py", "purge_references_to_pdf_certificates")
+
+        return vars(parser.parse_args(args))
+
+    def handle(self, *args, **options):
+        # database args will override cmd line args
+        if options["args_from_database"]:
+            options = self.get_args_from_database()
+
+        if options["dry_run"]:
+            dry_run_string = "[DRY RUN] "
+        else:
+            dry_run_string = ""
+
+        certificate_ids = options.get("certificate_ids")
+        if not certificate_ids:
+            raise CommandError("You must specify one or more certificate IDs")
+
+        log.info(
+            f"{dry_run_string}Purging download_url and download_uri "
+            f"from the following certificate records: {certificate_ids}"
+        )
+        if not options["dry_run"]:
+            GeneratedCertificate.objects.filter(id__in=certificate_ids).update(
+                download_url="",
+                download_uuid="",
+            )
diff --git a/lms/djangoapps/certificates/management/commands/tests/test_purge_references_to_pdf_certificates.py b/lms/djangoapps/certificates/management/commands/tests/test_purge_references_to_pdf_certificates.py
new file mode 100644
index 000000000000..b831aa813fdc
--- /dev/null
+++ b/lms/djangoapps/certificates/management/commands/tests/test_purge_references_to_pdf_certificates.py
@@ -0,0 +1,101 @@
+"""
+Tests for the `purge_references_to_pdf_certificates` management command.
+"""
+
+import uuid
+
+from django.core.management import CommandError, call_command
+from testfixtures import LogCapture
+
+from common.djangoapps.student.tests.factories import UserFactory
+from lms.djangoapps.certificates.models import GeneratedCertificate
+from lms.djangoapps.certificates.tests.factories import GeneratedCertificateFactory
+from xmodule.modulestore.tests.django_utils import ModuleStoreTestCase
+from xmodule.modulestore.tests.factories import CourseFactory
+
+
+class PurgeReferencesToPDFCertificatesTests(ModuleStoreTestCase):
+    """
+    Tests for the `purge_references_to_pdf_certificates` management command.
+    """
+
+    def setUp(self):
+        super().setUp()
+        self.user = UserFactory()
+        self.course_run_1 = CourseFactory()
+        self.course_run_2 = CourseFactory()
+        self.course_run_3 = CourseFactory()
+        self.cert_1 = GeneratedCertificateFactory(
+            user=self.user,
+            course_id=self.course_run_1.id,
+            download_url="http://example.com/1",
+            download_uuid=uuid.uuid4(),
+            grade=1.00,
+        )
+        self.cert_2 = GeneratedCertificateFactory(
+            user=self.user,
+            course_id=self.course_run_2.id,
+            download_url="http://example.com/2",
+            download_uuid=uuid.uuid4(),
+            grade=2.00,
+        )
+        self.cert_3 = GeneratedCertificateFactory(
+            user=self.user,
+            course_id=self.course_run_3.id,
+            download_url="http://example.com/3",
+            download_uuid=uuid.uuid4(),
+            grade=3.00,
+        )
+
+    def test_command_with_missing_certificate_ids(self):
+        """
+        Verify command with a missing certificate_ids param.
+        """
+        with self.assertRaises(CommandError):
+            call_command("purge_references_to_pdf_certificates")
+
+    def test_management_command(self):
+        """
+        Verify the management command purges expected data from only the certs requested.
+        """
+        call_command(
+            "purge_references_to_pdf_certificates",
+            "--certificate_ids",
+            self.cert_2.id,
+            self.cert_3.id,
+        )
+
+        cert1_post = GeneratedCertificate.objects.get(id=self.cert_1.id)
+        cert2_post = GeneratedCertificate.objects.get(id=self.cert_2.id)
+        cert3_post = GeneratedCertificate.objects.get(id=self.cert_3.id)
+        self.assertEqual(cert1_post.download_url, "http://example.com/1")
+        self.assertNotEqual(cert1_post.download_uuid, "")
+
+        self.assertEqual(cert2_post.download_url, "")
+        self.assertEqual(cert2_post.download_uuid, "")
+
+        self.assertEqual(cert3_post.download_url, "")
+        self.assertEqual(cert3_post.download_uuid, "")
+
+    def test_management_command_dry_run(self):
+        """
+        Verify that the management command does not purge any data when invoked with the `--dry-run` flag
+        """
+        expected_log_msg = (
+            "[DRY RUN] Purging download_url and download_uri "
+            f"from the following certificate records: {list(str(self.cert_3.id))}"
+        )
+
+        with LogCapture() as logger:
+            call_command(
+                "purge_references_to_pdf_certificates",
+                "--dry-run",
+                "--certificate_ids",
+                self.cert_3.id,
+            )
+
+        cert3_post = GeneratedCertificate.objects.get(id=self.cert_3.id)
+        self.assertEqual(cert3_post.download_url, "http://example.com/3")
+        self.assertNotEqual(cert3_post.download_uuid, "")
+
+        assert logger.records[0].msg == expected_log_msg
diff --git a/lms/djangoapps/certificates/migrations/0038_pdf_certificate_purge_data_management.py b/lms/djangoapps/certificates/migrations/0038_pdf_certificate_purge_data_management.py
new file mode 100644
index 000000000000..6816719975e2
--- /dev/null
+++ b/lms/djangoapps/certificates/migrations/0038_pdf_certificate_purge_data_management.py
@@ -0,0 +1,29 @@
+# Generated by Django 4.2.18 on 2025-02-20 22:36
+
+from django.conf import settings
+from django.db import migrations, models
+import django.db.models.deletion
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        migrations.swappable_dependency(settings.AUTH_USER_MODEL),
+        ('certificates', '0037_fix_legacy_broken_invalid_certs'),
+    ]
+
+    operations = [
+        migrations.CreateModel(
+            name='PurgeReferencestoPDFCertificatesCommandConfiguration',
+            fields=[
+                ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
+                ('change_date', models.DateTimeField(auto_now_add=True, verbose_name='Change date')),
+                ('enabled', models.BooleanField(default=False, verbose_name='Enabled')),
+                ('arguments', models.TextField(blank=True, default='', help_text="Arguments for the 'purge_references_to_pdf_certificates' management command. Specify like '--certificate_ids <id1> <id2>'")),
+                ('changed_by', models.ForeignKey(editable=False, null=True, on_delete=django.db.models.deletion.PROTECT, to=settings.AUTH_USER_MODEL, verbose_name='Changed by')),
+            ],
+            options={
+                'verbose_name': 'purge_references_to_pdf_certificates argument',
+            },
+        ),
+    ]
diff --git a/lms/djangoapps/certificates/models.py b/lms/djangoapps/certificates/models.py
index 9c58327b99aa..61ce095aecdc 100644
--- a/lms/djangoapps/certificates/models.py
+++ b/lms/djangoapps/certificates/models.py
@@ -1289,6 +1289,30 @@ def __str__(self):
         return str(self.arguments)
 
 
+class PurgeReferencestoPDFCertificatesCommandConfiguration(ConfigurationModel):
+    """
+    Manages configuration for a run of the purge_references_to_pdf_certificates management command.
+
+    .. no_pii:
+    """
+
+    class Meta:
+        app_label = "certificates"
+        verbose_name = "purge_references_to_pdf_certificates argument"
+
+    arguments = models.TextField(
+        blank=True,
+        help_text=(
+            "Arguments for the 'purge_references_to_pdf_certificates' management command. "
+            "Specify like '--certificate_ids <id1> <id2>'"
+        ),
+        default="",
+    )
+
+    def __str__(self):
+        return str(self.arguments)
+
+
 class CertificateDateOverride(TimeStampedModel):
     """
     Model to manually override a given certificate date with the given date.

From 2798f28c2ab58b8acbc5d676488f32ab269a1cbd Mon Sep 17 00:00:00 2001
From: Muhammad Faraz  Maqsood <faraz.maqsood@A006-01130.local>
Date: Thu, 20 Feb 2025 08:05:47 +0500
Subject: [PATCH 3/3] fix: send correct status for scanning

---
 cms/djangoapps/contentstore/tasks.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cms/djangoapps/contentstore/tasks.py b/cms/djangoapps/contentstore/tasks.py
index c2803f3e2751..f7f8505f048b 100644
--- a/cms/djangoapps/contentstore/tasks.py
+++ b/cms/djangoapps/contentstore/tasks.py
@@ -1129,7 +1129,7 @@ def _check_broken_links(task_instance, user_id, course_key_string, language):
     """
     user = _validate_user(task_instance, user_id, language)
 
-    task_instance.status.set_state('Scanning')
+    task_instance.status.set_state(UserTaskStatus.IN_PROGRESS)
     course_key = CourseKey.from_string(course_key_string)
 
     url_list = _scan_course_for_links(course_key)