@@ -114,6 +114,58 @@ def parse_range(range_str: str) -> Tuple[int, int]:
114114 return (start , end )
115115
116116
def cmd_list_dirs(args: argparse.Namespace) -> int:
    """Execute the list-dirs command (fast directory listing).

    Asks ``EuropePMCPDFDownloader.list_directories()`` for the available
    directory names and prints at most the first 30 of them, followed by a
    summary line and a usage hint. No package scanning is requested here.

    Args:
        args: Parsed command-line arguments. Only ``args.output_dir`` is
            read; it is passed to the downloader and echoed in the
            permission-error message.

    Returns:
        Exit code: 0 on success, 1 when a network, permission or other
        file-system error was caught and reported.
    """
    from src.bmlibrarian.importers.europe_pmc_pdf_downloader import EuropePMCPDFDownloader
    import requests

    rule_width = 70   # width of the "=" / "-" separator lines
    max_shown = 30    # directories printed before the list is truncated

    print("=" * rule_width)
    print("Europe PMC PDF Directory Listing")
    print("=" * rule_width)

    try:
        with EuropePMCPDFDownloader(output_dir=Path(args.output_dir)) as downloader:
            directories = downloader.list_directories()

            print(f"\n Found {len(directories)} PDF directories")
            print("-" * rule_width)

            # Show only the head of the listing to keep the output readable.
            for dir_name in directories[:max_shown]:
                print(f" {dir_name} /")

            hidden = len(directories) - max_shown
            if hidden > 0:
                print(f" ... and {hidden} more")

            print("-" * rule_width)
            print(f"Total directories: {len(directories)} ")
            print("\n Note: Each directory contains individual PMC#######.zip files.")
            print("Use 'list --max-dirs N' to scan contents of first N directories.")
            print("=" * rule_width)

        return 0

    # Handler order matters: requests' RequestException derives from IOError
    # (an alias of OSError) and PermissionError is an OSError subclass, so the
    # generic OSError handler has to come last.
    except requests.exceptions.RequestException as e:
        logging.error("Network error: %s ", e)
        print("Network error while contacting Europe PMC. Please check your connection.")
        return 1
    except PermissionError as e:
        logging.error("Permission error: %s ", e)
        print(f"Permission denied. Check that you have write access to: {args.output_dir} ")
        return 1
    except OSError as e:
        logging.error("File system error: %s ", e)
        print(f"File system error: {e} ")
        return 1
167+
168+
117169def cmd_list (args : argparse .Namespace ) -> int :
118170 """Execute the list command.
119171
@@ -130,6 +182,13 @@ def cmd_list(args: argparse.Namespace) -> int:
130182 print ("Europe PMC Open Access PDF Package Listing" )
131183 print ("=" * 70 )
132184
185+ if args .max_dirs :
186+ print (f"Scanning first { args .max_dirs } directories..." )
187+ else :
188+ print ("WARNING: Scanning all directories may take a long time!" )
189+ print ("Use --max-dirs N to limit scanning scope." )
190+ print ("=" * 70 )
191+
133192 pmcid_ranges = None
134193 if args .range :
135194 try :
@@ -143,8 +202,17 @@ def cmd_list(args: argparse.Namespace) -> int:
143202 output_dir = Path (args .output_dir ),
144203 pmcid_ranges = pmcid_ranges
145204 ) as downloader :
146- packages = downloader .list_available_packages (refresh = args .refresh )
205+ # Progress callback for directory scanning
206+ def progress_callback (dir_name : str , current : int , total : int ) -> None :
207+ print (f" Scanning [{ current } /{ total } ] { dir_name } ..." , end = '\r ' )
147208
209+ packages = downloader .list_available_packages (
210+ refresh = args .refresh ,
211+ max_directories = args .max_dirs ,
212+ progress_callback = progress_callback
213+ )
214+
215+ print () # Clear the progress line
148216 print (f"\n Found { len (packages )} PDF packages" )
149217 print ("-" * 70 )
150218
@@ -219,6 +287,8 @@ def cmd_download(args: argparse.Namespace) -> int:
219287 print (f"Output directory: { args .output_dir } " )
220288 if args .limit :
221289 print (f"Limit: { args .limit } packages" )
290+ if args .max_dirs :
291+ print (f"Max directories: { args .max_dirs } " )
222292 print (f"Delay between files: { args .delay } seconds" )
223293 print (f"Extract PDFs: { not args .no_extract } " )
224294 print ("=" * 70 )
@@ -240,8 +310,9 @@ def cmd_download(args: argparse.Namespace) -> int:
240310 max_retries = args .max_retries ,
241311 extract_pdfs = not args .no_extract
242312 ) as downloader :
243- # List packages first
244- packages = downloader .list_available_packages ()
313+ # List packages first (with max_dirs limit if specified)
314+ max_dirs = getattr (args , 'max_dirs' , None )
315+ packages = downloader .list_available_packages (max_directories = max_dirs )
245316 print (f"\n Found { len (packages )} PDF packages available" )
246317
247318 # Download with progress callback
@@ -553,6 +624,18 @@ def main() -> int:
553624
554625 subparsers = parser .add_subparsers (dest = 'command' , help = 'Command to execute' )
555626
627+ # List-dirs command (fast)
628+ list_dirs_parser = subparsers .add_parser (
629+ 'list-dirs' ,
630+ help = 'List available PDF directories (fast, no package scanning)'
631+ )
632+ list_dirs_parser .add_argument (
633+ '--output-dir' ,
634+ type = str ,
635+ default = DEFAULT_OUTPUT_DIR ,
636+ help = f'Output directory (default: { DEFAULT_OUTPUT_DIR } )'
637+ )
638+
556639 # List command
557640 list_parser = subparsers .add_parser (
558641 'list' ,
@@ -569,6 +652,11 @@ def main() -> int:
569652 type = str ,
570653 help = f'PMCID range filter (e.g., "1-1000000", valid: { MIN_PMCID } -{ MAX_PMCID } )'
571654 )
655+ list_parser .add_argument (
656+ '--max-dirs' ,
657+ type = int ,
658+ help = 'Maximum number of directories to scan (recommended: start with 1-5)'
659+ )
572660 list_parser .add_argument (
573661 '--refresh' ,
574662 action = 'store_true' ,
@@ -591,6 +679,11 @@ def main() -> int:
591679 type = int ,
592680 help = 'Maximum number of packages to download'
593681 )
682+ download_parser .add_argument (
683+ '--max-dirs' ,
684+ type = int ,
685+ help = 'Maximum number of directories to scan for packages'
686+ )
594687 download_parser .add_argument (
595688 '--delay' ,
596689 type = int ,
@@ -691,7 +784,9 @@ def main() -> int:
691784 setup_logging (args .verbose )
692785
693786 # Execute command
694- if args .command == 'list' :
787+ if args .command == 'list-dirs' :
788+ return cmd_list_dirs (args )
789+ elif args .command == 'list' :
695790 return cmd_list (args )
696791 elif args .command == 'download' :
697792 return cmd_download (args )
0 commit comments