99
1010from googleapiclient .discovery import Resource # type: ignore
1111from googleapiclient .errors import HttpError # type: ignore
12+ from googleapiclient .http import BatchHttpRequest # type: ignore
1213
1314from onyx .access .models import ExternalAccess
1415from onyx .connectors .google_drive .constants import DRIVE_FOLDER_TYPE
@@ -60,6 +61,8 @@ class DriveFileFieldType(Enum):
6061)
6162FOLDER_FIELDS = "nextPageToken, files(id, name, permissions, modifiedTime, webViewLink, shortcutDetails)"
6263
# Upper bound on the number of file lookups placed in a single batch request;
# get_files_by_web_view_links_batch splits longer inputs into chunks of this size.
# NOTE(review): presumably mirrors the Drive API per-batch call limit — confirm.
64+ MAX_BATCH_SIZE = 100
65+
6366HIERARCHY_FIELDS = "id, name, parents, webViewLink, mimeType, driveId"
6467
6568HIERARCHY_FIELDS_WITH_PERMISSIONS = (
@@ -216,7 +219,7 @@ def get_external_access_for_folder(
216219
217220
218221def _get_fields_for_file_type (field_type : DriveFileFieldType ) -> str :
219- """Get the appropriate fields string based on the field type enum"""
222+ """Get the appropriate fields string for files().list() based on the field type enum. """
220223 if field_type == DriveFileFieldType .SLIM :
221224 return SLIM_FILE_FIELDS
222225 elif field_type == DriveFileFieldType .WITH_PERMISSIONS :
@@ -225,6 +228,25 @@ def _get_fields_for_file_type(field_type: DriveFileFieldType) -> str:
225228 return FILE_FIELDS
226229
227230
231+ def _extract_single_file_fields (list_fields : str ) -> str :
232+ """Convert a files().list() fields string to one suitable for files().get().
233+
234+ List fields look like "nextPageToken, files(field1, field2, ...)"
235+ Single-file fields should be just "field1, field2, ..."
236+ """
237+ start = list_fields .find ("files(" )
238+ if start == - 1 :
239+ return list_fields
240+ inner_start = start + len ("files(" )
241+ inner_end = list_fields .rfind (")" )
242+ return list_fields [inner_start :inner_end ]
243+
244+
def _get_single_file_fields(field_type: DriveFileFieldType) -> str:
    """Return the files().get() fields selector matching the given field type.

    Looks up the files().list() selector for ``field_type`` and reduces it to
    the per-file field list.
    """
    list_selector = _get_fields_for_file_type(field_type)
    return _extract_single_file_fields(list_selector)
248+
249+
228250def _get_files_in_parent (
229251 service : Resource ,
230252 parent_id : str ,
@@ -536,3 +558,74 @@ def get_file_by_web_view_link(
536558 )
537559 .execute ()
538560 )
561+
562+
class BatchRetrievalResult:
    """Outcome of a batched file lookup, with successes and failures kept apart.

    Both mappings start out empty and are owned by the instance.
    """

    def __init__(self) -> None:
        # Retrieved file payloads.
        self.files: dict[str, GoogleDriveFileType] = dict()
        # Exceptions recorded for lookups that failed.
        self.errors: dict[str, Exception] = dict()
569+
570+
def get_files_by_web_view_links_batch(
    service: GoogleDriveService,
    web_view_links: list[str],
    field_type: DriveFileFieldType,
) -> BatchRetrievalResult:
    """Look up many Google Drive files by webViewLink via the batch API.

    Inputs longer than MAX_BATCH_SIZE are processed in consecutive chunks of
    at most MAX_BATCH_SIZE links, and the per-chunk outcomes are merged.

    Returns a BatchRetrievalResult holding the files that were retrieved and
    the errors for those that were not.
    """
    single_file_fields = _get_single_file_fields(field_type)
    link_count = len(web_view_links)

    if link_count > MAX_BATCH_SIZE:
        merged = BatchRetrievalResult()
        for begin in range(0, link_count, MAX_BATCH_SIZE):
            end = begin + MAX_BATCH_SIZE
            partial = _get_files_by_web_view_links_batch(
                service, web_view_links[begin:end], single_file_fields
            )
            merged.files.update(partial.files)
            merged.errors.update(partial.errors)
        return merged

    # Small enough for a single batch: hand the list over as-is.
    return _get_files_by_web_view_links_batch(
        service, web_view_links, single_file_fields
    )
593+
594+
def _get_files_by_web_view_links_batch(
    service: GoogleDriveService,
    web_view_links: list[str],
    fields: str,
) -> BatchRetrievalResult:
    """Fetch the given links in a single batch request.

    Each link is resolved to a file ID and queued as a files().get() call
    whose batch request ID is the link itself, so results and errors are
    keyed by webViewLink.

    Args:
        service: Drive service used to build the requests and the batch.
        web_view_links: Links to fetch. Repeated links are requested once.
        fields: Fields selector passed to each files().get() call.

    Returns:
        A BatchRetrievalResult. Links whose file ID could not be extracted
        (ValueError) and links whose request failed are recorded in
        ``errors``; all others are in ``files``.
    """

    result = BatchRetrievalResult()

    def callback(
        request_id: str,
        response: GoogleDriveFileType,
        exception: Exception | None,
    ) -> None:
        # Invoked once per queued request when the batch executes.
        if exception is not None:
            logger.warning(f"Error retrieving file {request_id}: {exception}")
            result.errors[request_id] = exception
        else:
            result.files[request_id] = response

    batch = cast(BatchHttpRequest, service.new_batch_http_request(callback=callback))

    # The link doubles as the batch request ID, and BatchHttpRequest.add()
    # raises KeyError when an ID is reused. De-duplicate (keeping first-seen
    # order) so a repeated link cannot abort the whole batch.
    for web_view_link in dict.fromkeys(web_view_links):
        try:
            file_id = _extract_file_id_from_web_view_link(web_view_link)
            request = service.files().get(
                fileId=file_id,
                supportsAllDrives=True,
                fields=fields,
            )
            batch.add(request, request_id=web_view_link)
        except ValueError as e:
            logger.warning(f"Failed to extract file ID from {web_view_link}: {e}")
            result.errors[web_view_link] = e

    batch.execute()
    return result
0 commit comments