Skip to content

Commit e705b69

Browse files
authored
add feature convert png to jpg to reduce file size (#113)
* add feature convert png to jpg to reduce file size * update testcase for new feature png2jpg * update testcase for the new feature png2jpg
1 parent 0fac32f commit e705b69

55 files changed

Lines changed: 367 additions & 54 deletions

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

README.md

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -73,6 +73,7 @@ There is a 50MB limit on arXiv submissions, so to make it fit:
7373
* Optionally compresses `.pdf` files using ghostscript (Linux and Mac only).
7474
You can allowlist some PDFs to override the global size setting using
7575
`images_allowlist`.
76+
* Optionally converts PNG images to JPG format to reduce file size.
7677

7778
#### TikZ picture source code concealment
7879

@@ -131,6 +132,9 @@ usage: arxiv_latex_cleaner@v1.0.8 [-h] [--resize_images] [--im_size IM_SIZE]
131132
[--if_exceptions IF_EXCEPTIONS [IF_EXCEPTIONS ...]]
132133
[--use_external_tikz USE_EXTERNAL_TIKZ]
133134
[--svg_inkscape [SVG_INKSCAPE]]
135+
[--convert_png_to_jpg]
136+
[--png_quality PNG_QUALITY]
137+
[--png_size_threshold PNG_SIZE_THRESHOLD]
134138
[--config CONFIG] [--verbose]
135139
input_folder
136140
@@ -207,6 +211,11 @@ optional arguments:
207211
(relative to the input folder) can be provided in case a
208212
different `inkscapepath` was set when loading the `svg`
209213
package.
214+
--convert_png_to_jpg Convert PNG images to JPG format to reduce file size
215+
--png_quality PNG_QUALITY
216+
JPG quality for PNG conversion (0-100, default: 50)
217+
--png_size_threshold PNG_SIZE_THRESHOLD
218+
Minimum PNG file size in MB to apply quality reduction (default: 0.5)
210219
--config CONFIG Read settings from `.yaml` config file. If command
211220
line arguments are provided additionally, the config
212221
file parameters are updated with the command line

arxiv_latex_cleaner/__main__.py

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -190,6 +190,26 @@ def if_prefixed(orig_string):
190190
),
191191
)
192192

193+
# Opt-in switch that enables PNG -> JPG conversion of figures.
PARSER.add_argument(
    "--convert_png_to_jpg",
    help=(
        "Convert PNG images to JPG format to reduce file size. "
        "Note that this will override --resize_images for PNG files."
    ),
    action="store_true",
)

# JPG quality setting used for the converted PNG files.
PARSER.add_argument(
    "--png_quality",
    help="JPG quality for PNG conversion (0-100, default: 50)",
    default=50,
    type=int,
)

# PNG files smaller than this size (in MB) are not quality-reduced.
PARSER.add_argument(
    "--png_size_threshold",
    help=(
        "Minimum PNG file size in MB to apply quality reduction "
        "(default: 0.5)"
    ),
    default=0.5,
    type=float,
)
212+
193213
PARSER.add_argument(
194214
"--config",
195215
type=str,

arxiv_latex_cleaner/arxiv_latex_cleaner.py

Lines changed: 224 additions & 49 deletions
Original file line numberDiff line numberDiff line change
@@ -508,7 +508,6 @@ def repl_svg(matchobj):
508508

509509
return content
510510

511-
512511
def _flatten_to_rgb(im):
    """Returns an RGB copy of `im`, compositing any transparency onto white.

    JPG has no alpha channel. A plain `convert('RGB')` simply drops alpha, which
    exposes whatever colour is stored under transparent pixels (commonly black).
    Pasting onto a white canvas matches how the figure looks on a paper page.
    """
    has_transparency = (
        im.mode in ('RGBA', 'LA', 'PA') or 'transparency' in im.info
    )
    if not has_transparency:
        return im.convert('RGB')

    rgba = im.convert('RGBA')
    canvas = Image.new('RGB', rgba.size, (255, 255, 255))
    # The alpha band is used as the paste mask.
    canvas.paste(rgba, mask=rgba.split()[-1])
    return canvas


def _resize_and_copy_figure(
    filename,
    origin_folder,
    destination_folder,
    resize_image,
    image_size,
    compress_pdf,
    pdf_resolution,
    convert_png_to_jpg=False,
    png_quality=50,
    png_size_threshold=0.5,
    verbose=False
):
    """Resizes and copies the input figure (either JPG, PNG, or PDF).

    Parameters:
      filename: The input filename, relative to `origin_folder`.
      origin_folder: The folder containing the input filename.
      destination_folder: The folder to copy the output filename to.
      resize_image: Whether to resize JPG/PNG images.
      image_size: The maximum size (longest side) of the image in pixels.
      compress_pdf: Whether to compress the PDF file.
      pdf_resolution: Resolution forwarded to `_resize_pdf_figure` for PDFs.
      convert_png_to_jpg: Whether to convert PNG files to JPG format. Note that
        this takes precedence over `resize_image` for PNG files.
      png_quality: JPG quality for converted PNG files (0-100).
      png_size_threshold: Minimum file size in MB to apply quality reduction;
        smaller PNG files are converted with quality 100.
      verbose: Enable verbose progress output on stdout.

    Returns:
      str: The actual output filename (differs from the input only when a PNG
      was successfully converted to JPG).
    """
    source_path = os.path.join(origin_folder, filename)
    target_path = os.path.join(destination_folder, filename)
    extension = os.path.splitext(filename)[1].lower()

    _create_dir_if_not_exists(
        os.path.join(destination_folder, os.path.dirname(filename))
    )

    if convert_png_to_jpg and extension == '.png':
        original_size_mb = os.path.getsize(source_path) / (1024 * 1024)
        # Determine quality based on file size.
        if original_size_mb < png_size_threshold:
            quality = 100  # Keep high quality for small files
            if verbose:
                print(f"Keeping original quality for small PNG: {filename}")
        else:
            quality = png_quality
            if verbose:
                print(f"Converting PNG to JPG with quality {quality}: {filename}")

        output_filename = os.path.splitext(filename)[0] + '.jpg'
        try:
            # The context manager closes the source file handle.
            with Image.open(source_path) as im:
                rgb_img = _flatten_to_rgb(im)
                rgb_img.save(
                    os.path.join(destination_folder, output_filename),
                    'JPEG',
                    quality=quality,
                )
        except Exception as e:  # Best effort, mirrors the resize branch below.
            if verbose:
                print(f"Failed to process image {filename}: {e}")
            # Fall back to copying the PNG unchanged; the name is unchanged too,
            # so the caller will not rewrite any LaTeX reference.
            shutil.copy(source_path, target_path)
            return filename

        if verbose:
            print(f"Converted: {filename} -> {output_filename}")
        return output_filename

    if resize_image and extension in ['.jpg', '.jpeg', '.png']:
        try:
            with Image.open(source_path) as im:
                resized = im
                if max(im.size) > image_size:
                    # Scale so that the longest side equals `image_size`.
                    scale = float(image_size) / max(im.size)
                    resized = im.resize(
                        tuple([int(x * scale) for x in im.size]),
                        Image.Resampling.LANCZOS,
                    )

                if extension in ['.jpg', '.jpeg']:
                    resized.save(target_path, 'JPEG', quality=90)
                else:
                    resized.save(target_path, 'PNG')
                return filename
        except Exception as e:
            if verbose:
                print(f"Failed to process image {filename}: {e}")
            # Fall back to simple copy
            shutil.copy(source_path, target_path)
            return filename

    elif compress_pdf and extension == '.pdf':
        _resize_pdf_figure(
            filename, origin_folder, destination_folder, pdf_resolution
        )
        return filename
    else:
        shutil.copy(source_path, target_path)
        return filename
610+
611+
612+
def _update_image_references(tex_contents_dict, old_filename, new_filename, verbose=False):
613+
"""Update references from old_filename to new_filename in all tex content."""
614+
if old_filename == new_filename:
615+
return # No change needed
616+
617+
old_base = os.path.splitext(old_filename)[0]
618+
new_base = os.path.splitext(new_filename)[0]
619+
620+
if verbose:
621+
print(f"Updating LaTeX references: {old_filename} -> {new_filename}")
622+
623+
for tex_file in tex_contents_dict:
624+
# Handle both string and list content
625+
if isinstance(tex_contents_dict[tex_file], list):
626+
content = ''.join(tex_contents_dict[tex_file])
627+
else:
628+
content = tex_contents_dict[tex_file]
629+
630+
content_changed = False
631+
632+
# Pattern 1: Direct filename with full extension, handling comments and newlines
633+
pattern1 = r'(\{(?:%\s*\n\s*)?[^}]*?)' + regex.escape(old_filename) + r'((?:%\s*\n\s*)?[^}]*?\})'
634+
replacement1 = r'\1' + new_filename + r'\2'
635+
636+
new_content = regex.sub(pattern1, replacement1, content, flags=regex.IGNORECASE | regex.DOTALL)
637+
if new_content != content:
638+
content = new_content
639+
content_changed = True
640+
if verbose:
641+
print(f"Applied pattern 1 (full filename) in {tex_file}")
642+
else:
643+
# Pattern 2: Base filename without extension, handling comments and newlines
644+
# Only apply this if Pattern 1 didn't match to avoid double replacements
645+
pattern2 = r'(\{(?:%\s*\n\s*)?[^}]*?)' + regex.escape(old_base) + r'((?:%\s*\n\s*)?[^}]*?\})'
646+
replacement2 = r'\1' + new_base + r'.jpg\2'
647+
648+
new_content = regex.sub(pattern2, replacement2, content, flags=regex.IGNORECASE | regex.DOTALL)
649+
if new_content != content:
650+
content = new_content
651+
content_changed = True
652+
if verbose:
653+
print(f"Applied pattern 2 (base filename) in {tex_file}")
654+
else:
655+
# Pattern 3: Handle cases where extension is split across lines with comments
656+
# This specifically targets patterns like: images/filename%\n.png
657+
pattern3 = r'(\{[^}]*?)' + regex.escape(old_base) + r'(%\s*\n\s*)(\.png)([^}]*?\})'
658+
replacement3 = r'\1' + new_base + r'\2.jpg\4'
659+
660+
new_content = regex.sub(pattern3, replacement3, content, flags=regex.IGNORECASE | regex.DOTALL)
661+
if new_content != content:
662+
content = new_content
663+
content_changed = True
664+
if verbose:
665+
print(f"Applied pattern 3 (split extension) in {tex_file}")
666+
667+
# Update the content back in the appropriate format
668+
if content_changed:
669+
if isinstance(tex_contents_dict[tex_file], list):
670+
# Convert back to list format, preserving line endings
671+
tex_contents_dict[tex_file] = content.split('\n')
672+
else:
673+
tex_contents_dict[tex_file] = content
674+
675+
if verbose:
676+
print(f"Updated references in {tex_file}")
677+
678+
# Re-write the updated tex files to the output directory
679+
if verbose and any(tex_contents_dict.values()):
680+
print("Re-writing updated tex files...")
681+
682+
return tex_contents_dict
551683

552684

553685
def _resize_pdf_figure(
@@ -575,26 +707,41 @@ def _copy_only_referenced_non_tex_not_in_root(parameters, contents, splits):
575707
):
576708
_copy_file(fn, parameters)
577709

578-
579710
def _resize_and_copy_figures_if_referenced(parameters, contents, splits):
    """Processes and copies every referenced figure into the output folder.

    Each figure in `splits['figures']` that is referenced in `contents` is
    handed to `_resize_and_copy_figure`. Per-image entries in
    `parameters['images_allowlist']` take precedence over the global `im_size`
    and `pdf_im_resolution` settings.

    Returns:
      dict mapping an input figure filename to its output filename, containing
      only the figures whose name changed (PNG converted to JPG).
    """
    allowlist = parameters['images_allowlist']

    # Fall back to the global settings for images that are not allowlisted.
    max_pixels = collections.defaultdict(lambda: parameters['im_size'])
    max_pixels.update(allowlist)
    resolutions = collections.defaultdict(
        lambda: parameters['pdf_im_resolution']
    )
    resolutions.update(allowlist)

    referenced_figures = _keep_only_referenced(
        splits['figures'], contents, strict=False
    )

    # Optional settings, identical for every figure.
    optional_settings = {
        'convert_png_to_jpg': parameters.get('convert_png_to_jpg', False),
        'png_quality': parameters.get('png_quality', 50),
        'png_size_threshold': parameters.get('png_size_threshold', 0.5),
        'verbose': parameters.get('verbose', False),
    }

    renamed = {}  # input filename -> output filename, only when they differ
    for figure in referenced_figures:
        written_as = _resize_and_copy_figure(
            filename=figure,
            origin_folder=parameters['input_folder'],
            destination_folder=parameters['output_folder'],
            resize_image=parameters['resize_images'],
            image_size=max_pixels[figure],
            compress_pdf=parameters['compress_pdf'],
            pdf_resolution=resolutions[figure],
            **optional_settings
        )
        if written_as != figure:
            renamed[figure] = written_as

    return renamed
598745

599746

600747
def _search_reference(filename, contents, strict=False):
@@ -859,9 +1006,37 @@ def run_arxiv_cleaner(parameters):
8591006
logging.info('Copying non-tex file %s.', non_tex_file)
8601007
_copy_file(non_tex_file, parameters)
8611008

862-
_resize_and_copy_figures_if_referenced(parameters, full_content, splits)
1009+
filename_changes = _resize_and_copy_figures_if_referenced(parameters, full_content, splits)
8631010
logging.info('Outputs written to %s', parameters['output_folder'])
8641011

1012+
# Update LaTeX references for changed filenames if tex_contents_dict is provided
1013+
if tex_contents and filename_changes:
1014+
for old_filename, new_filename in filename_changes.items():
1015+
tex_contents = _update_image_references(
1016+
tex_contents, old_filename, new_filename,
1017+
verbose=parameters.get('verbose', False)
1018+
)
1019+
1020+
# Re-write modified tex files with new references after resizing and copying figures
1021+
for tex_file in splits['tex_to_copy']:
1022+
if tex_file in tex_contents:
1023+
# Get the updated content
1024+
if isinstance(tex_contents[tex_file], list):
1025+
updated_content = ''.join(tex_contents[tex_file])
1026+
else:
1027+
updated_content = tex_contents[tex_file]
1028+
1029+
# Write the updated content back to the output file
1030+
output_path = os.path.join(parameters['output_folder'], tex_file)
1031+
logging.info('Re-writing modified tex file with updated references: %s', output_path)
1032+
_write_file_content(updated_content, output_path)
1033+
1034+
if parameters.get('verbose', False):
1035+
print(f"Re-wrote {tex_file} with updated image references")
1036+
1037+
if parameters.get('verbose', False):
1038+
print(f"Updated {len(filename_changes)} image references and re-wrote tex files")
1039+
8651040

8661041
def strip_whitespace(text):
8671042
"""Strips all whitespace characters.

0 commit comments

Comments
 (0)