Skip to content

Commit

Permalink
Merge pull request #6189 from avalonmediasystem/subtitle_metadata_ext…
Browse files Browse the repository at this point in the history
…raction

Handle embedded subtitle metadata (label, language)
  • Loading branch information
masaball authored Feb 24, 2025
2 parents e0bb92a + acaa95f commit 99d8331
Show file tree
Hide file tree
Showing 9 changed files with 903 additions and 13 deletions.
2 changes: 1 addition & 1 deletion Gemfile.lock
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@ GIT

GIT
remote: https://github.com/samvera-labs/active_encode.git
revision: adecfb1503c2a706661f716ff041ca7ad3c9c3d7
revision: 25c839f7fd88f1a42dbc56ba8ad3a0cfef1d4a6f
branch: main
specs:
active_encode (1.2.3)
Expand Down
81 changes: 72 additions & 9 deletions app/models/language_term.rb
Original file line number Diff line number Diff line change
Expand Up @@ -15,41 +15,88 @@
class LanguageTerm
# Raised when a language code or name cannot be resolved to a known term.
# Inherits from StandardError rather than Exception so that ordinary
# `rescue` / `rescue => e` clauses can handle it; callers that rescue
# LookupError (or Exception) by name are unaffected.
class LookupError < StandardError; end

Store = File.join(Rails.root, 'config/iso639-2.yml')
# ISO 639-1 (two-letter code) vocabulary, with conversion to ISO 639-2 terms.
class Iso6391 < LanguageTerm
  STORE = File.join(Rails.root, 'config/iso639-1.yml')
  VOCABULARY = 'http://id.loc.gov/vocabulary/iso639-1.tsv'

  class << self
    # Converts a two-letter ISO 639-1 code into the matching ISO 639-2 term.
    #
    # @param term [String] exactly two alphabetic characters, e.g. 'es'
    # @return the term returned by Iso6392.search for the first language
    #   name that ISO 639-2 recognizes
    # @raise [LookupError] if +term+ is not two alphabetic characters, is not
    #   a known ISO 639-1 code, or none of its names exist in ISO 639-2
    def convert_to_6392(term)
      # Anchor the pattern so the whole string must be exactly two letters.
      unless /\A[a-zA-Z]{2}\z/.match?(term)
        raise LookupError, "Incorrect number of characters. Term must be a string of 2 alphabetic characters."
      end

      lang_text = search(term).text
      # ISO 639-1 can define several language names for a single code:
      #   'es': "Spanish | Castilian".
      # ISO 639-2 does not follow that convention, so try each name in turn
      # until one of them matches an ISO 639-2 entry.
      names = lang_text.include?('|') ? lang_text.split('|').map(&:strip) : [lang_text]

      # Return straight from the loop instead of caching the match in a
      # class-level instance variable, which would leak a previous call's
      # result into a later lookup that finds nothing.
      names.each do |name|
        begin
          return Iso6392.search(name)
        rescue LookupError
          next
        end
      end

      raise LookupError, "Unknown language: `#{term}'"
    end

    # Code => term map for ISO 639-1, loaded once via load! and then cached.
    def map
      @@map_alpha2 ||= self.load!
    end
  end
end

# ISO 639-2 (three-letter code) vocabulary.
class Iso6392 < LanguageTerm
  VOCABULARY = 'http://id.loc.gov/vocabulary/languages.tsv'
  STORE = File.join(Rails.root, 'config/iso639-2.yml')

  # Code => term map for ISO 639-2; built by load! on first access and
  # kept in @@map for subsequent calls.
  def self.map
    @@map ||= load!
  end
end

class << self
def map
@@map ||= self.load!
# Resolves a language term, choosing the vocabulary by the length of +term+:
# a two-character term is converted from ISO 639-1 via
# Iso6391.convert_to_6392; anything else is searched in ISO 639-2.
def find(term)
  return Iso6391.convert_to_6392(term) if term.length == 2

  Iso6392.search(term)
end
alias_method :[], :find

def find(value)
# Looks +value+ up in this vocabulary's map: first as a key (downcased),
# then as an exact match on an entry's :text. Wraps the hit in a new
# instance, or raises LookupError when nothing matches.
def search(value)
  entry = self.map[value.downcase]
  if entry.nil?
    entry = self.map.values.find { |candidate| candidate[:text] == value }
  end
  raise LookupError, "Unknown language: `#{value}'" if entry.nil?

  self.new(entry)
end
alias_method :[], :find

# Returns autocomplete suggestions for language names.
#
# @param query [String, nil] text to look for (case-insensitively) inside
#   each entry's :text; when blank, every entry in the map is returned
# @param _id unused
# @return [Array<Hash>] { id: code, display: text } hashes sorted by :display
def autocomplete(query, _id = nil)
  terms = self.map
  if query.present?
    # Escape the query so it is matched literally: user input such as "("
    # would otherwise raise RegexpError or be interpreted as pattern syntax.
    # Built once here rather than once per entry.
    pattern = /#{Regexp.escape(query)}/i
    terms = terms.select { |_code, term| term && pattern.match?(term[:text]) }
  end
  terms.to_a.uniq.map { |entry| { id: entry[1][:code], display: entry[1][:text] } }
       .sort { |x, y| x[:display] <=> y[:display] }
end

def load!
if File.exist?(Store)
YAML.load(File.read(Store))
if File.exist?(store)
YAML.load(File.read(store))
else
harvest!
end
end

def harvest!
language_map = {}
doc = RestClient.get('http://id.loc.gov/vocabulary/languages.tsv').split(/\n/).collect{ |l| l.split(/\t/) }
doc = RestClient.get(vocabulary).split(/\n/).collect{ |l| l.split(/\t/) }
doc.shift
doc.each { |entry| language_map[entry[1].to_s] = { code: entry[1].to_s, text: entry[2].to_s, uri: entry[0].to_s } }
begin
File.open(Store,'w') { |f| f.write(YAML.dump(language_map)) }
File.open(store,'w') { |f| f.write(YAML.dump(language_map)) }
rescue
# Don't care if we can't cache it
end
Expand All @@ -68,4 +115,20 @@ def code
# Returns the :text entry of the wrapped term hash (@term).
def text
@term[:text]
end

# Instance-level reader for the STORE constant of this object's class.
def store
self.class::STORE
end

# Class-level reader for STORE, resolved on the receiving class so each
# subclass (e.g. Iso6391, Iso6392) yields its own value.
def self.store
self::STORE
end

# Instance-level reader for the VOCABULARY constant of this object's class.
def vocabulary
self.class::VOCABULARY
end

# Class-level reader for VOCABULARY, resolved on the receiving class so each
# subclass (e.g. Iso6391, Iso6392) yields its own value.
def self.vocabulary
self::VOCABULARY
end
end
2 changes: 1 addition & 1 deletion app/models/media_object.rb
Original file line number Diff line number Diff line change
Expand Up @@ -99,7 +99,7 @@ def validate_note_type
end

def validate_language
Array(language).each{|i|errors.add(:language, "Language not recognized (#{i[:code]})") unless LanguageTerm::map[i[:code]] }
Array(language).each{|i|errors.add(:language, "Language not recognized (#{i[:code]})") unless LanguageTerm::Iso6392.map[i[:code]] }
end

def validate_related_items
Expand Down
2 changes: 1 addition & 1 deletion app/models/supplemental_file.rb
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ class SupplementalFile < ApplicationRecord

# TODO: the empty tag should represent a generic supplemental file
validates :tags, array_inclusion: ['transcript', 'caption', 'machine_generated', '', nil]
validates :language, inclusion: { in: LanguageTerm.map.keys }
validates :language, inclusion: { in: LanguageTerm::Iso6392.map.keys }
validates :parent_id, presence: true
validate :validate_file_type, if: :caption?

Expand Down
8 changes: 8 additions & 0 deletions app/models/watched_encode.rb
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,14 @@ class WatchedEncode < ActiveEncode::Base
encode.output.collect! do |output|
if output.format == "vtt"
new_file = SupplementalFile.new(tags: ['caption'], parent_id: record.master_file_id)
new_file.label = output.label.presence
if output.language.present?
begin
new_file.language = LanguageTerm.find(output.language).code
rescue LanguageTerm::LookupError
new_file.language = nil
end
end
new_file.attach_file(FileLocator.new(output.url).location, io: true)
new_file.save
output.url = if Settings.active_storage.bucket.present?
Expand Down
Loading

0 comments on commit 99d8331

Please sign in to comment.