Skip to content

Commit 04d250c

Browse files
hherb and claude committed
Fix Europe PMC PDF downloader to use correct FTP structure
The downloader was looking at the wrong URL (/ftp/oa/), which only contains XML metadata files. The actual PDFs are at /ftp/pdf/ with a different structure:

- Top-level directories: PMCxxxx000/, PMCxxxx001/, etc. (1000 dirs)
- Inside each: Individual PMC#######.zip files (not tar.gz)
- Each zip contains a single PDF for that PMCID

Changes:

- Update EUROPE_PMC_PDF_BASE_URL from /ftp/oa/ to /ftp/pdf/
- Rewrite list_available_packages() for two-level directory structure
- Add list_directories() for fast directory-only listing
- Change archive handling from tarfile to zipfile module
- Add max_directories parameter to limit scanning scope
- Add --max-dirs CLI option for list and download commands
- Add list-dirs CLI command for fast directory listing
- Add progress callback for directory scanning

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
1 parent 0ad2328 commit 04d250c

2 files changed

Lines changed: 302 additions & 134 deletions

File tree

europe_pmc_pdf_cli.py

Lines changed: 99 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -114,6 +114,58 @@ def parse_range(range_str: str) -> Tuple[int, int]:
114114
return (start, end)
115115

116116

117+
def cmd_list_dirs(args: argparse.Namespace) -> int:
    """Execute the list-dirs command (fast directory listing).

    Opens an ``EuropePMCPDFDownloader`` for ``args.output_dir``, calls its
    ``list_directories()`` method and prints a preview of the first entries
    followed by the total count. No other downloader method is called here.

    Args:
        args: Parsed command-line arguments. Only ``args.output_dir`` is read.

    Returns:
        Exit code: 0 on success, 1 when a network, permission or other
        file-system error is caught.
    """
    # Imported inside the command, exactly as the original code does.
    from src.bmlibrarian.importers.europe_pmc_pdf_downloader import EuropePMCPDFDownloader
    import requests

    # Number of directory names shown before the listing is truncated.
    preview_limit = 30
    heavy_rule = "=" * 70
    light_rule = "-" * 70

    print(heavy_rule)
    print("Europe PMC PDF Directory Listing")
    print(heavy_rule)

    try:
        with EuropePMCPDFDownloader(output_dir=Path(args.output_dir)) as downloader:
            directories = downloader.list_directories()

        total = len(directories)
        print(f"\nFound {total} PDF directories")
        print(light_rule)

        # Show only the first few directories to keep the output readable.
        for dir_name in directories[:preview_limit]:
            print(f" {dir_name}/")

        if total > preview_limit:
            print(f" ... and {total - preview_limit} more")

        print(light_rule)
        print(f"Total directories: {total}")
        print("\nNote: Each directory contains individual PMC#######.zip files.")
        print("Use 'list --max-dirs N' to scan contents of first N directories.")
        print(heavy_rule)

        return 0

    # Handler order matters: PermissionError is a subclass of OSError, and
    # requests' RequestException derives from IOError (an alias of OSError),
    # so the most specific handlers must come before the generic OSError one.
    except requests.exceptions.RequestException as e:
        logging.error("Network error: %s", e)
        print("Network error while contacting Europe PMC. Please check your connection.")
        return 1
    except PermissionError as e:
        logging.error("Permission error: %s", e)
        print(f"Permission denied. Check that you have write access to: {args.output_dir}")
        return 1
    except OSError as e:
        logging.error("File system error: %s", e)
        print(f"File system error: {e}")
        return 1
167+
168+
117169
def cmd_list(args: argparse.Namespace) -> int:
118170
"""Execute the list command.
119171
@@ -130,6 +182,13 @@ def cmd_list(args: argparse.Namespace) -> int:
130182
print("Europe PMC Open Access PDF Package Listing")
131183
print("=" * 70)
132184

185+
if args.max_dirs:
186+
print(f"Scanning first {args.max_dirs} directories...")
187+
else:
188+
print("WARNING: Scanning all directories may take a long time!")
189+
print("Use --max-dirs N to limit scanning scope.")
190+
print("=" * 70)
191+
133192
pmcid_ranges = None
134193
if args.range:
135194
try:
@@ -143,8 +202,17 @@ def cmd_list(args: argparse.Namespace) -> int:
143202
output_dir=Path(args.output_dir),
144203
pmcid_ranges=pmcid_ranges
145204
) as downloader:
146-
packages = downloader.list_available_packages(refresh=args.refresh)
205+
# Progress callback for directory scanning
206+
def progress_callback(dir_name: str, current: int, total: int) -> None:
207+
print(f" Scanning [{current}/{total}] {dir_name}...", end='\r')
147208

209+
packages = downloader.list_available_packages(
210+
refresh=args.refresh,
211+
max_directories=args.max_dirs,
212+
progress_callback=progress_callback
213+
)
214+
215+
print() # Clear the progress line
148216
print(f"\nFound {len(packages)} PDF packages")
149217
print("-" * 70)
150218

@@ -219,6 +287,8 @@ def cmd_download(args: argparse.Namespace) -> int:
219287
print(f"Output directory: {args.output_dir}")
220288
if args.limit:
221289
print(f"Limit: {args.limit} packages")
290+
if args.max_dirs:
291+
print(f"Max directories: {args.max_dirs}")
222292
print(f"Delay between files: {args.delay} seconds")
223293
print(f"Extract PDFs: {not args.no_extract}")
224294
print("=" * 70)
@@ -240,8 +310,9 @@ def cmd_download(args: argparse.Namespace) -> int:
240310
max_retries=args.max_retries,
241311
extract_pdfs=not args.no_extract
242312
) as downloader:
243-
# List packages first
244-
packages = downloader.list_available_packages()
313+
# List packages first (with max_dirs limit if specified)
314+
max_dirs = getattr(args, 'max_dirs', None)
315+
packages = downloader.list_available_packages(max_directories=max_dirs)
245316
print(f"\nFound {len(packages)} PDF packages available")
246317

247318
# Download with progress callback
@@ -553,6 +624,18 @@ def main() -> int:
553624

554625
subparsers = parser.add_subparsers(dest='command', help='Command to execute')
555626

627+
# List-dirs command (fast)
628+
list_dirs_parser = subparsers.add_parser(
629+
'list-dirs',
630+
help='List available PDF directories (fast, no package scanning)'
631+
)
632+
list_dirs_parser.add_argument(
633+
'--output-dir',
634+
type=str,
635+
default=DEFAULT_OUTPUT_DIR,
636+
help=f'Output directory (default: {DEFAULT_OUTPUT_DIR})'
637+
)
638+
556639
# List command
557640
list_parser = subparsers.add_parser(
558641
'list',
@@ -569,6 +652,11 @@ def main() -> int:
569652
type=str,
570653
help=f'PMCID range filter (e.g., "1-1000000", valid: {MIN_PMCID}-{MAX_PMCID})'
571654
)
655+
list_parser.add_argument(
656+
'--max-dirs',
657+
type=int,
658+
help='Maximum number of directories to scan (recommended: start with 1-5)'
659+
)
572660
list_parser.add_argument(
573661
'--refresh',
574662
action='store_true',
@@ -591,6 +679,11 @@ def main() -> int:
591679
type=int,
592680
help='Maximum number of packages to download'
593681
)
682+
download_parser.add_argument(
683+
'--max-dirs',
684+
type=int,
685+
help='Maximum number of directories to scan for packages'
686+
)
594687
download_parser.add_argument(
595688
'--delay',
596689
type=int,
@@ -691,7 +784,9 @@ def main() -> int:
691784
setup_logging(args.verbose)
692785

693786
# Execute command
694-
if args.command == 'list':
787+
if args.command == 'list-dirs':
788+
return cmd_list_dirs(args)
789+
elif args.command == 'list':
695790
return cmd_list(args)
696791
elif args.command == 'download':
697792
return cmd_download(args)

0 commit comments

Comments
 (0)