I am trying to replace dates at the start of each block of my document. I have a working PoC, but it feels sluggish. I think the problem is that I am using a search even though I know that the text to be replaced is at the beginning of the block, but I can't work out how to select it directly.
from PyQt6 import QtWidgets, QtGui
import time
class MainWindow(QtWidgets.QMainWindow):
    """Demo window: a 20,000-line plain-text document and a toolbar button
    that rewrites the year prefix at the start of every block."""

    def __init__(self):
        """Build the editor, fill it with sample text, and add the toolbar button."""
        super().__init__()

        # Create a plain text edit widget and add some text to it.
        # String repetition builds the sample in one step; ``+=`` inside a
        # loop re-copies the growing string on every iteration.
        self.text_edit = QtWidgets.QPlainTextEdit()
        text = "2022 something or the other\n2022 some other test\n" * 10000
        self.text_edit.setPlainText(text)
        self.setCentralWidget(self.text_edit)

        # Create a button and connect its clicked signal to select_text.
        self.button = QtWidgets.QPushButton("Change Text")
        self.button.clicked.connect(self.select_text)
        toolbar = self.addToolBar("Toolbar")
        toolbar.addWidget(self.button)

    def select_text(self):
        """Replace the leading ``old[i]`` of block *i* with ``new[i]``.

        Blocks are visited in document order with ``begin()``/``next()``.
        The prefix is selected by position instead of being searched for,
        so text that merely occurs later in a block (or in another block)
        is never touched. Blocks that do not start with the expected prefix
        are left unchanged. All edits are grouped into one edit block, and
        the elapsed time is printed when done.
        """
        old = ["2022"] * 20000
        new = ["2023"] * 20000
        start_time = time.perf_counter()

        doc = self.text_edit.document()
        cursor = QtGui.QTextCursor(doc)
        keep_anchor = QtGui.QTextCursor.MoveMode.KeepAnchor

        cursor.beginEditBlock()
        block = doc.begin()
        # zip() stops at the shorter list, so a document with more blocks
        # than replacement pairs cannot index past the end of the lists.
        for old_prefix, new_prefix in zip(old, new):
            if not block.isValid():
                break  # ran out of blocks before running out of pairs
            if block.text().startswith(old_prefix):
                prefix_start = block.position()
                cursor.setPosition(prefix_start)
                cursor.setPosition(prefix_start + len(old_prefix), keep_anchor)
                # insertText() replaces the current selection.
                cursor.insertText(new_prefix)
            block = block.next()
        cursor.endEditBlock()

        end_time = time.perf_counter()
        elapsed_time = end_time - start_time
        print(f"Elapsed time: {elapsed_time:.2f} seconds")
if __name__ == "__main__":
    # Create the Qt application, show the demo window, and enter the event loop.
    application = QtWidgets.QApplication([])
    main_window = MainWindow()
    main_window.show()
    application.exec()
This is my current code, working as required, but I hope that selecting with move position instead of searching will speed it up.
Edit: Based on musicamante's answer, the improved code:
def select_text(self):
    """Replace successive matches of ``old[i]`` with ``new[i]``.

    Starting from the beginning of the document, each pair is applied to
    the next occurrence found after the previous replacement. Processing
    stops when the pairs are exhausted or when a search finds nothing.
    All edits are grouped into one edit block, and the elapsed time is
    printed when done.
    """
    old = ["2022"] * 20000
    new = ["2023"] * 20000
    start_time = time.perf_counter()
    cursor = self.text_edit.textCursor()
    cursor.beginEditBlock()
    doc = self.text_edit.document()
    find_cursor = QtGui.QTextCursor(doc.begin())
    # Iterate the pairs directly instead of counting up to a hard-coded
    # 20000, so the loop stays correct if the lists change length.
    for old_text, new_text in zip(old, new):
        # find() continues from the position of the cursor it is given.
        find_cursor = doc.find(old_text, find_cursor)
        if find_cursor.isNull():
            break  # no further occurrence in the document
        # insertText() replaces the selection that find() returned.
        find_cursor.insertText(new_text)
    cursor.endEditBlock()
    end_time = time.perf_counter()
    elapsed_time = end_time - start_time
    print("Elapsed time: {:.2f} seconds".format(elapsed_time))
Edit2: The goal of the application is that I have timestamps with unknown time zone settings, and instead of asking for the time zone, I have added a possible UTC offset to shift the time to UTC (when/if needed). I have reworked the example and the approach:
from PyQt6 import QtWidgets, QtGui
import time
import datetime
class MainWindow(QtWidgets.QMainWindow):
    """Demo window: a short timestamped log and a toolbar button that
    rewrites the timestamp at the start of every block."""

    def __init__(self):
        """Build the editor, fill it with sample text, and add the toolbar button."""
        super().__init__()

        # Create a plain text edit widget and add some text to it.
        self.text_edit = QtWidgets.QPlainTextEdit()
        text = "2022-08-02T15:41:05.000 something or the other\n2022-08-02T15:41:06.000 Some parts may contain timestamps 2021-08-02T15:42:06.000 or\u2028New lines within a block\n2022-08-02T15:42:06.000 some other test"
        self.text_edit.setPlainText(text)
        self.setCentralWidget(self.text_edit)

        # Create a button and connect its clicked signal to shift_timezone.
        self.button = QtWidgets.QPushButton("Change Text")
        self.button.clicked.connect(self.shift_timezone)
        toolbar = self.addToolBar("Toolbar")
        toolbar.addWidget(self.button)

    def shift_timezone(self):
        """Overwrite the leading timestamp of each block with a shifted one.

        Block *i* receives ``text_timestamps[i]`` plus a one-hour offset,
        formatted with millisecond precision and a trailing ``Z``. The new
        text replaces the first 24 characters of the block. Only as many
        blocks as there are timestamps are processed. All edits are grouped
        into one edit block, and the elapsed time is printed when done.
        """
        text_timestamps = [1659447665, 1659447666, 1659447726]
        offset_seconds = 3600
        # Width of the text being replaced: a 23-character timestamp plus
        # the one-character placeholder that the trailing "Z" overwrites.
        timestamp_width = 24
        start_time = time.perf_counter()

        cursor = self.text_edit.textCursor()
        move = cursor.MoveOperation
        mode = cursor.MoveMode
        cursor.movePosition(move.Start, mode.MoveAnchor)

        # Read the block count once, and never index past the end of the
        # timestamp list when the document has more blocks than timestamps.
        block_count = self.text_edit.document().blockCount()

        cursor.beginEditBlock()
        for timestamp in text_timestamps[:block_count]:
            # NOTE(review): fromtimestamp() without a tz argument converts
            # to local time, while the "Z" suffix denotes UTC - confirm
            # this is the intended behavior.
            dt = datetime.datetime.fromtimestamp(timestamp + offset_seconds)
            iso_string = dt.strftime('%Y-%m-%dT%H:%M:%S.%f')[:-3] + 'Z'
            # Select the old timestamp; insertText() replaces the selection.
            cursor.movePosition(move.Right, mode.KeepAnchor, timestamp_width)
            cursor.insertText(iso_string)
            cursor.movePosition(move.NextBlock, mode.MoveAnchor)
        cursor.endEditBlock()

        end_time = time.perf_counter()
        elapsed_time = end_time - start_time
        print(f"Elapsed time: {elapsed_time:.2f} seconds")
if __name__ == "__main__":
    # Create the Qt application, show the demo window, and enter the event loop.
    application = QtWidgets.QApplication([])
    main_window = MainWindow()
    main_window.show()
    application.exec()
(I intentionally added a space as a placeholder for Zulu in the original display text to account for multiple runs of the function)
This works perfectly on a functional level, but the performance is atrocious (7x the runtime of just dropping the document, and creating a new one with the updated timestamps). As I do not have control of the content, it can contain the exact same timestamps that I have, and it can also have line breaks within the block.
What I do know is that all blocks will start with the timestamp, that I can control the format of the timestamp, and that I have the source with the timestamp and the content in a different variable, making it possible to drop everything.
Am I making some obviously performance intensive mistake here?
Iterating through all blocks is not efficient and, besides that, using findBlockByNumber() is suboptimal, since QTextDocument already provides QTextDocument.begin() and QTextBlock.next().
Also, there is no point in removing text and inserting, since insertion already removes any previous selection.
Since find(str) automatically considers the current position of the given cursor, and searches through the whole document (as opposed to the regex based functions), you can use a much simpler while loop:
def select_text(self):
    """Replace every "2022" in the document with "2023" and print the elapsed time.

    The replacements happen between beginEditBlock() and endEditBlock()
    calls on the widget's text cursor.
    """
    started = time.perf_counter()
    edit_cursor = self.text_edit.textCursor()
    edit_cursor.beginEditBlock()
    document = self.text_edit.document()
    match = QtGui.QTextCursor(document.begin())
    while True:
        # Each search resumes from the cursor returned by the previous one.
        match = document.find("2022", match)
        if match.isNull():
            break
        match.insertText("2023")
    edit_cursor.endEditBlock()
    elapsed_time = time.perf_counter() - started
    print("Elapsed time: {:.2f} seconds".format(elapsed_time))
That said, since you are using a plain text widget, it doesn't make a lot of sense to use the QTextDocument interface to process basic substitutions; using regular expressions will be much faster, thanks to the simple and fast re.sub() function.
Considering the basic case above, it would be just a matter of using a regex that matches the pattern at the beginning of the line.
You only need to convert new line characters (which in Qt become the Unicode Paragraph Separator, U+2029), so that you can use the ^ anchor for the beginning of each line:
# Select the whole document. PyQt6 only exposes enum members through their
# scoped names (MoveOperation.End, MoveMode.KeepAnchor).
cursor.setPosition(0)
cursor.movePosition(cursor.MoveOperation.End, cursor.MoveMode.KeepAnchor)
# Turn Qt's paragraph separators (U+2029) into "\n" so that "^" with
# re.MULTILINE anchors at the start of every block.
text = cursor.selectedText().replace('\u2029', '\n')
text = re.sub(r'^2022', '2023', text, flags=re.MULTILINE)
# insertText() replaces the current selection, i.e. the whole document.
cursor.insertText(text)
This can also be achieved with multiple substitutions, for example using a simple dictionary:
# Line-start replacements, applied one after another in insertion order.
substitutions = {
    '2020': '2021',
    '2022': '2023',
}
for old_year, new_year in substitutions.items():
    line_start_pattern = r'^{}'.format(old_year)
    text = re.sub(line_start_pattern, new_year, text, flags=re.MULTILINE)
But, there's a catch.
What if those pairs are incompatible? Take, for instance, the following dictionary:
# Example of conflicting pairs: the value '2021' of the first entry is also
# the key of the last entry, so applying the pairs one after another in
# insertion order rewrites a line-start '2020' twice.
substitutions = {
'2020': '2021',
'2022': '2023',
'2021': '2022',
}
Considering the insertion order guaranteed since Python 3.7, you will end up with all 2020 matches replaced by 2022 instead of 2021. And if you have no direct control over the insertion order (or you use Python < 3.7), you may end up with all entries replaced with 2023 as well.
With that in mind, a possible solution is to use temporary placeholders and then replace them with the actual substitution.
In the following example I'm using the @ character in order to define (and later identify) the temporary placeholders: if 2020 should become 2021 and any previous 2021 should be 2022, those will be replaced by @2021@ and @2022@, respectively; in this way, any previous occurrence of 2021 (without the @ characters) will not be confused with @2021@.
In the rare case a @<match>@ is found in the original text, a further @ character is added until no existing match exists.
# Choose a placeholder wrapper that cannot collide with the existing text.
# The temporary tokens inserted below are built from the replacement VALUES,
# so those are the strings that must not already start a line. The probe
# uses re.MULTILINE so that every line start is checked, not just the first.
placeholder = '@'
while any(
    re.search('^' + placeholder + value + placeholder, text,
              flags=re.MULTILINE)
    for value in substitutions.values()
):
    placeholder += '@'  # collision found: lengthen the wrapper and re-check

# First pass: replace each key with its wrapped value. A wrapped value can
# no longer match another key, so chained pairs (2020 -> 2021 -> 2022)
# do not cascade.
final = {}
for x, y in substitutions.items():
    repl = placeholder + y + placeholder
    text = re.sub(r'^{}'.format(x), repl, text, flags=re.MULTILINE)
    final[repl] = y

# Second pass: strip the wrapper, leaving the real replacement values.
for x, y in final.items():
    text = re.sub('^' + x, y, text, flags=re.MULTILINE)