I'm working on a Python script that exports every message ID present in my Gmail account along with their added labels (full label path) into a TXT file.
The script itself works, but the export speed is around 2 messages per second. For smaller accounts, this is fine, but for larger accounts, the export can easily take days, and in some cases, weeks.
Is there a way to increase the processing speed of the script, or is the limitation coming from Google itself?
def get_label_path(service, label_id):
    """Return a label's name prefixed by the names of its ancestors.

    The label is fetched with ``users().labels().get``. For as long as the
    fetched resource contains a ``parent`` key, that parent is fetched as
    well and its name is placed in front, separated by ``/``.

    Args:
        service: Gmail API client used for the label lookups.
        label_id: ID of the label whose path is wanted.

    Returns:
        The ``/``-joined path, outermost ancestor first.
    """
    def fetch(target_id):
        # One labels.get round trip per label resource.
        return service.users().labels().get(userId='me', id=target_id).execute()

    node = fetch(label_id)
    # Collected leaf-first and reversed at the end.
    segments = [node['name']]
    while 'parent' in node:
        node = fetch(node['parent'])
        segments.append(node['name'])
    return '/'.join(reversed(segments))
def get_message_ids_with_labels(service):
    """Write each message's Message-ID header and label paths to a text file.

    Pages through ``users().messages().list``, loads every listed message
    through ``get_message`` (defined outside this block) and writes one
    ``Message-ID: <value> - Labels: [<paths>]`` line per message to
    ``OUTPUT_FILE_PATH``.

    Args:
        service: Gmail API client used for all requests.
    """
    # Each label path is resolved once per run; without this cache every
    # message repeats the same labels.get requests for the same label IDs.
    label_paths = {}
    page_token = None
    # Explicit UTF-8 so that non-ASCII label names or header values do not
    # depend on the platform's default encoding.
    with open(OUTPUT_FILE_PATH, 'w', encoding='utf-8') as output_file:
        while True:
            results = service.users().messages().list(userId='me', pageToken=page_token).execute()
            messages = results.get('messages', [])
            if not messages:
                break
            for message in messages:
                msg = get_message(service, message['id'])
                headers = msg['payload']['headers']
                # Header names are matched case-insensitively; None is
                # written when the message has no Message-ID header.
                message_id = next(
                    (header['value'] for header in headers if header['name'].lower() == 'message-id'),
                    None,
                )
                labels = []
                for label_id in msg.get('labelIds', []):
                    if label_id not in label_paths:
                        label_paths[label_id] = get_label_path(service, label_id)
                    labels.append(label_paths[label_id])
                output_file.write(f"Message-ID: {message_id} - Labels: {labels}\n")
            page_token = results.get('nextPageToken')
            if not page_token:
                break
I believe your goal is as follows.
In your situation, how about the following flow?
When this flow is reflected in the modified script, it becomes as follows.
Please set OUTPUT_FILE_PATH.
OUTPUT_FILE_PATH = "sample.txt"  # Please set your output filename.

# Filled by `sample`: one [Gmail message id, list of label ids] entry per
# successful batch response.
ar = []


def sample(id, res, err):
    """Batch callback that records a message's Gmail id and label ids in ``ar``.

    Args:
        id: Identifier of the request inside the batch.
        res: Response body of the request, or None when the request failed.
        err: Error reported for the request, or None on success.
    """
    if err is not None or res is None:
        # A failed request carries no response body; subscripting it would
        # raise TypeError and abort the whole batch, so report and skip it.
        print(f"Request {id} failed: {err}")
        return
    ar.append([res["id"], res.get("labelIds", [])])
def get_labels(service):
    """Return a dict that maps every listed label ID to its label name.

    Args:
        service: Gmail API client used for the ``labels().list`` request.

    Returns:
        ``{label_id: label_name}`` for each entry under ``labels`` in the
        response; an empty dict when the response has no such key.
    """
    response = service.users().labels().list(userId='me').execute()
    return {entry["id"]: entry["name"] for entry in response.get('labels', [])}
def get_message_ids_with_labels(service, batch_size=100):
    """Export every message's Gmail id and label names to ``OUTPUT_FILE_PATH``.

    Flow: load the label id -> name table once, list all message ids, fetch
    ``id`` and ``labelIds`` for the messages through batch requests (the
    ``sample`` callback stores each response in ``ar``), then write one
    ``Message-ID: <id> - Labels: <names>`` line per stored response.

    Args:
        service: Gmail API client used for all requests.
        batch_size: Number of ``messages.get`` calls placed in one batch
            request. Defaults to 100, the value used originally; a smaller
            value may help if requests are being rate limited.
    """
    # Retrieve label list as an object.
    label_names = get_labels(service)

    # `ar` lives at module level; empty it so that calling this function a
    # second time does not write the previous run's entries again.
    ar.clear()

    # Retrieve all message IDs.
    message_ids = []
    page_token = ""
    while page_token is not None:
        obj = service.users().messages().list(userId='me', pageToken=page_token, maxResults=500).execute()
        message_ids.extend(e["id"] for e in obj.get('messages', []))
        page_token = obj.get("nextPageToken")
    print(f"Total message IDs: {len(message_ids)}")

    # Retrieve label ids from message IDs.
    for i in range(0, len(message_ids), batch_size):
        batch_ids = message_ids[i:i + batch_size]
        print(f"Processing from {i} to {i + len(batch_ids)}")
        batch = service.new_batch_http_request(callback=sample)
        for message_id in batch_ids:
            batch.add(service.users().messages().get(userId='me', id=message_id, fields="id,labelIds"))
        batch.execute()

    # Create result texts using the label list object and label IDs.
    lines = []
    for entry_id, label_ids in ar:
        # Fall back to the raw label id when it is missing from the table
        # instead of aborting the export with a KeyError.
        names = [label_names.get(label_id, label_id) for label_id in label_ids]
        lines.append(f"Message-ID: {entry_id} - Labels: {','.join(names)}")

    # Write the result texts into a file. UTF-8 is set explicitly so that
    # non-ASCII label names do not depend on the platform default encoding.
    with open(OUTPUT_FILE_PATH, 'w', encoding='utf-8') as output_file:
        output_file.write("\n".join(lines))
    print("Done")
In this script, please call the function get_message_ids_with_labels(service). Here, service is a client for using the Gmail API.
When this script is run, a text file including Message-ID: ### - Labels: ###
is created using the above flow.
service
can be used for using Gmail API. Please be careful about this. From your following reply,
there is a huge speed increase. There was just one issue, as I need the "Message-ID" from "metadataHeaders[]",
I couldn't notice that you wanted to retrieve the value of Message-ID
in the header from your question. I thought that you wanted to retrieve the message ID of Gmail. In the case of the value of Message-ID
in the header, how about the following sample script? The above sample script was modified.
OUTPUT_FILE_PATH = "sample.txt"  # Please set your output filename.

# Filled by `sample`: one [Message-ID header value, list of label ids] entry
# per successful batch response.
ar = []


def sample(id, res, err):
    """Batch callback that records a message's Message-ID header and label ids.

    Appends ``[message_id, label_ids]`` to ``ar``. ``"No Message-ID"`` is
    stored when the response contains no Message-ID header.

    Args:
        id: Identifier of the request inside the batch.
        res: Response body of the request, or None when the request failed.
        err: Error reported for the request, or None on success.
    """
    if err is not None or res is None:
        # A failed request carries no response body; subscripting it would
        # raise TypeError and abort the whole batch, so report and skip it.
        print(f"Request {id} failed: {err}")
        return
    headers = (res.get("payload") or {}).get("headers") or []
    # Search all returned headers and compare names case-insensitively
    # rather than assuming the first header is spelled exactly "Message-ID".
    message_id = next(
        (
            header.get("value", "No Message-ID")
            for header in headers
            if (header.get("name") or "").lower() == "message-id"
        ),
        "No Message-ID",
    )
    ar.append([message_id, res.get("labelIds") or []])
def get_labels(service):
    """Build the label ID -> label name lookup table for the account.

    Args:
        service: Gmail API client used for the ``labels().list`` request.

    Returns:
        A dict with one ``id: name`` pair per entry under ``labels`` in the
        response; empty when the response has no ``labels`` key.
    """
    listing = service.users().labels().list(userId='me').execute()
    label_entries = listing.get('labels', [])
    return {item["id"]: item["name"] for item in label_entries}
def get_message_ids_with_labels(service, batch_size=100):
    """Export every message's Message-ID header and label names to a file.

    Flow: load the label id -> name table once, list all message ids, fetch
    each message's metadata (restricted to the ``Message-ID`` header) through
    batch requests (the ``sample`` callback stores each response in ``ar``),
    then write one ``Message-ID: <value> - Labels: <names>`` line per stored
    response to ``OUTPUT_FILE_PATH``.

    Args:
        service: Gmail API client used for all requests.
        batch_size: Number of ``messages.get`` calls placed in one batch
            request. Defaults to 100, the value used originally; a smaller
            value may help if requests are being rate limited.
    """
    # Retrieve label list as an object.
    label_names = get_labels(service)

    # `ar` lives at module level; empty it so that calling this function a
    # second time does not write the previous run's entries again.
    ar.clear()

    # Retrieve all message IDs.
    message_ids = []
    page_token = ""
    while page_token is not None:
        obj = service.users().messages().list(userId='me', pageToken=page_token, maxResults=500).execute()
        message_ids.extend(e["id"] for e in obj.get('messages', []))
        page_token = obj.get("nextPageToken")
    print(f"Total message IDs: {len(message_ids)}")

    # Retrieve label ids from message IDs.
    for i in range(0, len(message_ids), batch_size):
        batch_ids = message_ids[i:i + batch_size]
        print(f"Processing from {i} to {i + len(batch_ids)}")
        batch = service.new_batch_http_request(callback=sample)
        for message_id in batch_ids:
            batch.add(service.users().messages().get(userId='me', id=message_id, format='metadata', metadataHeaders=['Message-ID']))
        batch.execute()

    # Create result texts using the label list object and label IDs.
    lines = []
    for header_value, label_ids in ar:
        # Fall back to the raw label id when it is missing from the table
        # instead of aborting the export with a KeyError.
        names = [label_names.get(label_id, label_id) for label_id in label_ids]
        lines.append(f"Message-ID: {header_value} - Labels: {','.join(names)}")

    # Write the result texts into a file. UTF-8 is set explicitly so that
    # non-ASCII label names do not depend on the platform default encoding.
    with open(OUTPUT_FILE_PATH, 'w', encoding='utf-8') as output_file:
        output_file.write("\n".join(lines))
    print("Done")
Message-ID
in the mail header and the labels are retrieved. About your following reply,
But I received this error: "line 40, in sample ar.append([res["payload"]["headers"][0]["value"], res.get("labelIds", [])]) ~~~^^^^^^^^^^^ TypeError: 'NoneType' object is not subscriptable". I assume it is related to messages without labels. I assumed that when there is no label, the output would be "... - Labels: "
In this case, please modify the above script as follows.
def sample(id, res, err):
    """Batch callback storing the first header value and the label ids in ``ar``.

    Reads the ``value`` of the first entry of ``res["payload"]["headers"]``
    and the ``labelIds`` list (empty when the key is absent), and appends
    them to ``ar`` as a two-item list. ``id`` and ``err`` are not inspected.
    """
    first_header_value = res["payload"]["headers"][0]["value"]
    label_ids = res.get("labelIds", [])
    ar.append([first_header_value, label_ids])
def sample(id, res, err):
    """Batch callback that records a message's Message-ID header and label ids.

    Appends ``[message_id, label_ids]`` to ``ar``. ``"No Message-ID"`` is
    stored when the response contains no Message-ID header, and an empty
    list when it contains no label ids.

    Args:
        id: Identifier of the request inside the batch.
        res: Response body of the request, or None when the request failed.
        err: Error reported for the request, or None on success.
    """
    if err is not None or res is None:
        # A failed request carries no response body, so neither res.get nor
        # res["payload"] can be used; report the failure and skip the entry.
        print(f"Request {id} failed: {err}")
        return
    label_ids = res.get("labelIds") or []
    headers = (res.get("payload") or {}).get("headers") or []
    # Search all returned headers and compare names case-insensitively
    # rather than requiring the first header to be spelled "Message-ID".
    message_id = next(
        (
            header.get("value", "No Message-ID")
            for header in headers
            if (header.get("name") or "").lower() == "message-id"
        ),
        "No Message-ID",
    )
    ar.append([message_id, label_ids])