@@ -508,7 +508,6 @@ def repl_svg(matchobj):
508508
509509 return content
510510
511-
def _flatten_to_rgb(im):
  """Returns an RGB copy of `im`, compositing any transparency onto white."""
  has_alpha = im.mode in ('RGBA', 'LA') or (
      im.mode == 'P' and 'transparency' in im.info
  )
  if not has_alpha:
    return im.convert('RGB')
  # A plain convert('RGB') just drops the alpha channel, which typically turns
  # transparent figure backgrounds black. Paste onto white instead.
  rgba = im.convert('RGBA')
  background = Image.new('RGB', rgba.size, (255, 255, 255))
  background.paste(rgba, mask=rgba.getchannel('A'))
  return background


def _resize_and_copy_figure(
    filename,
    origin_folder,
    destination_folder,
    resize_image,
    image_size,
    compress_pdf,
    pdf_resolution,
    convert_png_to_jpg=False,
    png_quality=50,
    png_size_threshold=0.5,
    verbose=False,
):
  """Resizes and copies the input figure (either JPG, PNG, or PDF).

  Exactly one of the following happens, checked in this order:
    1. PNG -> JPG conversion (if `convert_png_to_jpg` and the file is a PNG).
    2. Down-scaling of a JPG/PNG (if `resize_image`).
    3. PDF compression via `_resize_pdf_figure` (if `compress_pdf`).
    4. A plain copy.
  If step 1 or 2 fails, the file is copied unchanged instead.

  Args:
    filename: The input filename, relative to `origin_folder`.
    origin_folder: The folder containing the input filename.
    destination_folder: The folder to write the output file to.
    resize_image: Whether to resize JPG/PNG images.
    image_size: The maximum size (longest side) of the image in pixels.
    compress_pdf: Whether to compress PDF files.
    pdf_resolution: Resolution forwarded to `_resize_pdf_figure`.
    convert_png_to_jpg: Whether to convert PNG files to JPG format. Note that
      this takes precedence over `resize_image` for PNG files.
    png_quality: JPG quality for converted PNG files (0-100).
    png_size_threshold: PNG files smaller than this size (in MB) are converted
      with quality 100 instead of `png_quality`.
    verbose: Enable verbose progress output on stdout.

  Returns:
    The output filename relative to `destination_folder`. It differs from
    `filename` only when a PNG was converted to JPG.
  """
  _create_dir_if_not_exists(
      os.path.join(destination_folder, os.path.dirname(filename))
  )

  source_path = os.path.join(origin_folder, filename)
  destination_path = os.path.join(destination_folder, filename)
  extension = os.path.splitext(filename)[1].lower()

  if convert_png_to_jpg and extension == '.png':
    output_filename = os.path.splitext(filename)[0] + '.jpg'
    try:
      original_size_mb = os.path.getsize(source_path) / (1024 * 1024)
      # Determine quality based on file size.
      if original_size_mb < png_size_threshold:
        quality = 100  # Keep high quality for small files.
        if verbose:
          print(f"Keeping original quality for small PNG: {filename}")
      else:
        quality = png_quality
        if verbose:
          print(f"Converting PNG to JPG with quality {quality}: {filename}")

      # The context manager closes the underlying file handle.
      with Image.open(source_path) as im:
        _flatten_to_rgb(im).save(
            os.path.join(destination_folder, output_filename),
            'JPEG',
            quality=quality,
        )
    except Exception as e:  # pylint: disable=broad-except
      # Best effort, mirroring the resize branch: keep the original PNG (and
      # its original name, so no references need rewriting).
      logging.warning('Failed to convert image %s: %s', filename, e)
      shutil.copy(source_path, destination_path)
      return filename

    if verbose:
      print(f"Converted: {filename} -> {output_filename}")
    return output_filename

  if resize_image and extension in ('.jpg', '.jpeg', '.png'):
    try:
      with Image.open(source_path) as im:
        resized = im
        if max(im.size) > image_size:
          resized = im.resize(
              tuple(
                  [int(x * float(image_size) / max(im.size)) for x in im.size]
              ),
              Image.Resampling.LANCZOS,
          )
        if extension == '.png':
          resized.save(destination_path, 'PNG')
        else:
          resized.save(destination_path, 'JPEG', quality=90)
    except Exception as e:  # pylint: disable=broad-except
      # Best effort: an unreadable image is shipped as-is rather than aborting
      # the whole run, but the failure is always reported.
      logging.warning('Failed to process image %s: %s', filename, e)
      shutil.copy(source_path, destination_path)
    return filename

  if compress_pdf and extension == '.pdf':
    _resize_pdf_figure(
        filename, origin_folder, destination_folder, pdf_resolution
    )
    return filename

  shutil.copy(source_path, destination_path)
  return filename
610+
611+
612+ def _update_image_references (tex_contents_dict , old_filename , new_filename , verbose = False ):
613+ """Update references from old_filename to new_filename in all tex content."""
614+ if old_filename == new_filename :
615+ return # No change needed
616+
617+ old_base = os .path .splitext (old_filename )[0 ]
618+ new_base = os .path .splitext (new_filename )[0 ]
619+
620+ if verbose :
621+ print (f"Updating LaTeX references: { old_filename } -> { new_filename } " )
622+
623+ for tex_file in tex_contents_dict :
624+ # Handle both string and list content
625+ if isinstance (tex_contents_dict [tex_file ], list ):
626+ content = '' .join (tex_contents_dict [tex_file ])
627+ else :
628+ content = tex_contents_dict [tex_file ]
629+
630+ content_changed = False
631+
632+ # Pattern 1: Direct filename with full extension, handling comments and newlines
633+ pattern1 = r'(\{(?:%\s*\n\s*)?[^}]*?)' + regex .escape (old_filename ) + r'((?:%\s*\n\s*)?[^}]*?\})'
634+ replacement1 = r'\1' + new_filename + r'\2'
635+
636+ new_content = regex .sub (pattern1 , replacement1 , content , flags = regex .IGNORECASE | regex .DOTALL )
637+ if new_content != content :
638+ content = new_content
639+ content_changed = True
640+ if verbose :
641+ print (f"Applied pattern 1 (full filename) in { tex_file } " )
642+ else :
643+ # Pattern 2: Base filename without extension, handling comments and newlines
644+ # Only apply this if Pattern 1 didn't match to avoid double replacements
645+ pattern2 = r'(\{(?:%\s*\n\s*)?[^}]*?)' + regex .escape (old_base ) + r'((?:%\s*\n\s*)?[^}]*?\})'
646+ replacement2 = r'\1' + new_base + r'.jpg\2'
647+
648+ new_content = regex .sub (pattern2 , replacement2 , content , flags = regex .IGNORECASE | regex .DOTALL )
649+ if new_content != content :
650+ content = new_content
651+ content_changed = True
652+ if verbose :
653+ print (f"Applied pattern 2 (base filename) in { tex_file } " )
654+ else :
655+ # Pattern 3: Handle cases where extension is split across lines with comments
656+ # This specifically targets patterns like: images/filename%\n.png
657+ pattern3 = r'(\{[^}]*?)' + regex .escape (old_base ) + r'(%\s*\n\s*)(\.png)([^}]*?\})'
658+ replacement3 = r'\1' + new_base + r'\2.jpg\4'
659+
660+ new_content = regex .sub (pattern3 , replacement3 , content , flags = regex .IGNORECASE | regex .DOTALL )
661+ if new_content != content :
662+ content = new_content
663+ content_changed = True
664+ if verbose :
665+ print (f"Applied pattern 3 (split extension) in { tex_file } " )
666+
667+ # Update the content back in the appropriate format
668+ if content_changed :
669+ if isinstance (tex_contents_dict [tex_file ], list ):
670+ # Convert back to list format, preserving line endings
671+ tex_contents_dict [tex_file ] = content .split ('\n ' )
672+ else :
673+ tex_contents_dict [tex_file ] = content
674+
675+ if verbose :
676+ print (f"Updated references in { tex_file } " )
677+
678+ # Re-write the updated tex files to the output directory
679+ if verbose and any (tex_contents_dict .values ()):
680+ print ("Re-writing updated tex files..." )
681+
682+ return tex_contents_dict
551683
552684
553685def _resize_pdf_figure (
@@ -575,26 +707,41 @@ def _copy_only_referenced_non_tex_not_in_root(parameters, contents, splits):
575707 ):
576708 _copy_file (fn , parameters )
577709
578-
def _resize_and_copy_figures_if_referenced(parameters, contents, splits):
  """Processes every referenced figure and reports the files that were renamed.

  Each figure in `splits['figures']` that `_keep_only_referenced` retains for
  `contents` is passed to `_resize_and_copy_figure`. Per-file entries in
  `parameters['images_allowlist']` override the global `im_size` and
  `pdf_im_resolution` settings.

  Args:
    parameters: Run configuration. Reads 'im_size', 'pdf_im_resolution',
      'images_allowlist', 'input_folder', 'output_folder', 'resize_images' and
      'compress_pdf', plus the optional 'convert_png_to_jpg', 'png_quality',
      'png_size_threshold' and 'verbose'.
    contents: The tex content used to decide which figures are referenced.
    splits: Dict whose 'figures' entry lists the candidate figure files.

  Returns:
    A dict mapping each original filename to its new output filename, with
    entries only for figures whose filename changed.
  """
  # Fall back to the global setting for files without an allowlist override.
  size_for = collections.defaultdict(lambda: parameters['im_size'])
  size_for.update(parameters['images_allowlist'])
  resolution_for = collections.defaultdict(
      lambda: parameters['pdf_im_resolution']
  )
  resolution_for.update(parameters['images_allowlist'])

  # Optional settings, with the same defaults as `_resize_and_copy_figure`.
  conversion_options = {
      'convert_png_to_jpg': parameters.get('convert_png_to_jpg', False),
      'png_quality': parameters.get('png_quality', 50),
      'png_size_threshold': parameters.get('png_size_threshold', 0.5),
      'verbose': parameters.get('verbose', False),
  }

  renamed = {}
  referenced_figures = _keep_only_referenced(
      splits['figures'], contents, strict=False
  )
  for figure in referenced_figures:
    written_as = _resize_and_copy_figure(
        filename=figure,
        origin_folder=parameters['input_folder'],
        destination_folder=parameters['output_folder'],
        resize_image=parameters['resize_images'],
        image_size=size_for[figure],
        compress_pdf=parameters['compress_pdf'],
        pdf_resolution=resolution_for[figure],
        **conversion_options,
    )
    if written_as == figure:
      continue
    # The output name differs, so references need rewriting later.
    renamed[figure] = written_as

  return renamed
598745
599746
600747def _search_reference (filename , contents , strict = False ):
@@ -859,9 +1006,37 @@ def run_arxiv_cleaner(parameters):
8591006 logging .info ('Copying non-tex file %s.' , non_tex_file )
8601007 _copy_file (non_tex_file , parameters )
8611008
862- _resize_and_copy_figures_if_referenced (parameters , full_content , splits )
1009+ filename_changes = _resize_and_copy_figures_if_referenced (parameters , full_content , splits )
8631010 logging .info ('Outputs written to %s' , parameters ['output_folder' ])
8641011
1012+ # Update LaTeX references for changed filenames if tex_contents_dict is provided
1013+ if tex_contents and filename_changes :
1014+ for old_filename , new_filename in filename_changes .items ():
1015+ tex_contents = _update_image_references (
1016+ tex_contents , old_filename , new_filename ,
1017+ verbose = parameters .get ('verbose' , False )
1018+ )
1019+
1020+ # Re-write modified tex files with new references after resizing and copying figures
1021+ for tex_file in splits ['tex_to_copy' ]:
1022+ if tex_file in tex_contents :
1023+ # Get the updated content
1024+ if isinstance (tex_contents [tex_file ], list ):
1025+ updated_content = '' .join (tex_contents [tex_file ])
1026+ else :
1027+ updated_content = tex_contents [tex_file ]
1028+
1029+ # Write the updated content back to the output file
1030+ output_path = os .path .join (parameters ['output_folder' ], tex_file )
1031+ logging .info ('Re-writing modified tex file with updated references: %s' , output_path )
1032+ _write_file_content (updated_content , output_path )
1033+
1034+ if parameters .get ('verbose' , False ):
1035+ print (f"Re-wrote { tex_file } with updated image references" )
1036+
1037+ if parameters .get ('verbose' , False ):
1038+ print (f"Updated { len (filename_changes )} image references and re-wrote tex files" )
1039+
8651040
8661041def strip_whitespace (text ):
8671042 """Strips all whitespace characters.
0 commit comments