|
6 | 6 | require 'json' |
7 | 7 | require 'securerandom' |
8 | 8 | require 'openssl' |
| 9 | +require 'nokogiri' |
9 | 10 |
|
10 | 11 | # Configure SSL context globally at module load time to work around CRL verification issues |
11 | 12 | # This is a production-safe workaround for OpenSSL 3.6+ that disables CRL checking |
@@ -323,6 +324,181 @@ def localize_chat(chat, target_locale:, source_locale: nil, fast: nil, reference |
323 | 324 | response[:chat] || [] |
324 | 325 | end |
325 | 326 |
|
# Localizes an HTML document while preserving structure and formatting.
#
# The document is parsed with Nokogiri, every non-blank text node and every
# localizable attribute (meta[content], img[alt], input[placeholder],
# a[title]) found under <head> and <body> is collected into a flat hash, and
# that hash is translated with a single #localize_raw call. Content inside
# <script> and <style> elements is never collected.
#
# Each entry is keyed by a positional path such as "body/0/1" (text node) or
# "body/2#alt" (attribute). An index counts only element children and
# non-blank text children of the parent.
#
# Text is sent with surrounding whitespace stripped; the original leading and
# trailing whitespace of each text node is restored around the translation so
# that spacing next to inline elements (e.g. "Hello <b>world</b>") survives.
#
# @param html [String] the HTML document string to be localized
# @param target_locale [String] the target locale code (e.g., 'es', 'fr', 'ja')
# @param source_locale [String, nil] the source locale code (optional, auto-detected if not provided)
# @param fast [Boolean, nil] enable fast mode for quicker results (optional)
# @param reference [Hash, nil] additional context for translation (optional)
# @param on_progress [Proc, nil] callback for progress updates (optional); ignored when a block is given
# @param concurrent [Boolean] enable concurrent processing (default: false)
#
# @yield [progress] optional block for progress tracking
# @yieldparam progress [Integer] completion percentage (0-100)
#
# @return [String] the localized HTML document as a string, with updated lang attribute
#
# @raise [ValidationError] if target_locale is missing or html is nil
# @raise [APIError] if the API request fails
#
# @example Basic usage
#   html = '<html><head><title>Hello</title></head><body><p>World</p></body></html>'
#   result = engine.localize_html(html, target_locale: 'es')
#   # => "<html lang=\"es\">..."
def localize_html(html, target_locale:, source_locale: nil, fast: nil, reference: nil,
                  on_progress: nil, concurrent: false, &block)
  raise ValidationError, 'Target locale is required' if target_locale.nil? || target_locale.empty?
  raise ValidationError, 'HTML cannot be nil' if html.nil?

  callback = block || on_progress

  doc = Nokogiri::HTML::Document.parse(html)

  localizable_attributes = {
    'meta' => ['content'],
    'img' => ['alt'],
    'input' => ['placeholder'],
    'a' => ['title']
  }
  unlocalizable_tags = %w[script style]

  # Children that take part in path indexing: elements and non-blank text.
  significant = ->(n) { n.element? || (n.text? && !n.text.strip.empty?) }

  extracted_content = {} # path => source string, sent to the API
  targets = {}           # path => [node, attribute, leading_ws, trailing_ws]

  # Builds the positional key for a node (optionally for one of its attributes).
  get_path = lambda do |node, attribute = nil|
    indices = []
    section = nil
    current = node

    while (parent = current.parent)
      # Stop at the direct child of <html>; its tag name (head/body) prefixes the path.
      if parent == doc.root
        section = current.name.downcase if current.element?
        break
      end

      index = parent.children.select { |n| significant.call(n) }.index(current)
      indices.unshift(index) if index
      current = parent
    end

    base_path = section ? "#{section}/#{indices.join('/')}" : indices.join('/')
    attribute ? "#{base_path}##{attribute}" : base_path
  end

  # Recursively records localizable text and attributes below +node+.
  collect = lambda do |node|
    if node.text?
      raw = node.text
      stripped = raw.strip
      unless stripped.empty?
        path = get_path.call(node)
        leading = raw[0, raw.length - raw.lstrip.length]
        trailing = raw[raw.rstrip.length..-1]
        extracted_content[path] = stripped
        targets[path] = [node, nil, leading, trailing]
      end
    elsif node.element?
      tag_name = node.name.downcase
      # script/style carry no localizable attributes and their content must
      # stay untouched, so the whole subtree is skipped.
      unless unlocalizable_tags.include?(tag_name)
        (localizable_attributes[tag_name] || []).each do |attr|
          value = node[attr]
          next if value.nil? || value.strip.empty?

          path = get_path.call(node, attr)
          extracted_content[path] = value
          targets[path] = [node, attr, nil, nil]
        end

        node.children.each { |child| collect.call(child) }
      end
    end
  end

  %w[head body].each do |section_name|
    section = doc.at_css(section_name)
    next unless section

    section.children.each { |child| collect.call(child) }
  end

  localized_content = localize_raw(
    extracted_content,
    target_locale: target_locale,
    source_locale: source_locale,
    fast: fast,
    reference: reference,
    concurrent: concurrent
  ) do |progress, _chunk, _processed_chunk|
    callback&.call(progress)
  end

  doc.root['lang'] = target_locale if doc.root

  # Write translations straight onto the nodes recorded during extraction.
  # Unknown keys and nil values are ignored so the source content is kept.
  localized_content.each do |path, value|
    target = targets[path.to_s]
    next if target.nil? || value.nil?

    node, attribute, leading, trailing = target
    if attribute
      node[attribute] = value.to_s
    else
      node.content = "#{leading}#{value}#{trailing}"
    end
  end

  doc.to_html
end
| 501 | + |
326 | 502 | # Localizes text to multiple target locales. |
327 | 503 | # |
328 | 504 | # @param text [String] the text to localize |
|
0 commit comments