How can I utilize a Python script within a Google Cloud Function to efficiently transfer files from one GCP bucket to another? The source bucket (let's call it "bucket1") contains a complex directory structure with various folders and subfolders, such as
bucket1/folder/subfolder1/file_abc.csv
bucket2/folder/subfolder2/file_def.csv
bucket3/folder/subfolder3/file_ghi.csv
My goal is to selectively move files that are older than three months to a target bucket ("bucket2"). If a file meets the age condition, I'd like to compress it into a ZIP archive before transferring it to the destination bucket. I've already attempted the code below:
from google.cloud import storage
import zipfile
import datetime
import re
import os
def move_files(request, context):
source_bucket_name = 'bucket1'
target_bucket_name = 'bucket2'
storage_client = storage.Client()
source_bucket = storage_client.bucket(source_bucket_name)
target_bucket = storage_client.bucket(target_bucket_name)
get_old_date = (datetime.datetime.now() - datetime.timedelta(days=90)).date()
object_pattern = [re.compile(r'folder/[^/]+/filename.*')]
all_blobs = storage_client.list_blobs(source_bucket)
filtered_blobs = [b for b in all_blobs if any(pattern.match(b.name) for pattern in object_pattern)]
temp_dir = '/tmp/zipped_files'
os.makedirs(temp_dir, exist_ok=True)
for blob in filtered_blobs:
blob_created_time = blob.time_created
if blob_created_time.date() < get_old_date:
destination_path = os.path.join(temp_dir, os.path.basename(blob.name))
blob.download_to_filename(destination_path)
if os.listdir(temp_dir):
zip_file_path = '/tmp/zipped_files.zip'
with zipfile.ZipFile(zip_file_path, 'w') as zipf:
for root, _, files in os.walk(temp_dir):
for file in files:
zipf.write(os.path.join(root, file), os.path.relpath(os.path.join(root, file), temp_dir))
target_blob_name = f'archive_{get_old_date}.zip'
target_blob = target_bucket.blob(target_blob_name)
target_blob.upload_from_filename(zip_file_path)
os.remove(zip_file_path)
for file in os.listdir(temp_dir):
os.remove(os.path.join(temp_dir, file))
os.rmdir(temp_dir)
print(f'Successfully moved and zipped files to {target_blob_name} in {target_bucket_name} bucket.')
else:
print('No files to move and zip.')
for blob in filtered_blobs:
blob.delete()
print("Successful")
This code deleting my files from the source bucket but now moving any files to the target bucket. Can someone please help me to resolve this?
The object\_pattern
you defined to match file names seems to be missing the actual regular expression. Modify it like this:
object_pattern = [re.compile(r'folder/[^/]+/file_.*\.csv')]
This will match file names like folder/subfolder1/file\_abc.csv
in the source bucket.
In your loop, when downloading files to destination\_path
, you need to create a ZIP archive before downloading:
if blob_created_time.date() < get_old_date:
destination_path = os.path.join(temp_dir, os.path.basename(blob.name))
blob.download_to_filename(destination_path)
with zipfile.ZipFile(zip_file_path, 'a') as zipf: # Open the existing zip file in 'append' mode and create a new entry in the zip archive
zipf.write(destination_path, os.path.basename(blob.name)) # Write the downloaded file into the zip archive, using the blob's base name as the file name within the archive
os.remove(destination_path) # Remove the downloaded file from the temporary location
This will add the file to a ZIP archive and remove the original.
References: