I downloaded the SynthText in the Wild dataset from the official site.
Then I read the official readme.txt, but I couldn't find how many distinct characters the dataset contains. I googled it, but couldn't find an answer either.
As you can see in the example image below, some symbols such as `.` and `-` exist. So this dataset contains letters (26, or 52 if upper and lower case are counted separately) + digits (10) + some symbols (?).
Does anyone know the exact number?
I implemented my own code to count the symbols.
def get_characters(basedir, imagedirname='SynthText', skip_missing=False):
    """
    Collect and print every non-alphanumeric character used in the SynthText annotations.

    Every image entry of gt.mat is visited through ``_gtmatRecognizer``. For each
    word, all characters other than ``A-Z``, ``a-z`` and ``0-9`` are added to a set;
    the set and its size are written to stdout as progress and printed once at the end.

    The image files are only checked for existence, they are not decoded, because
    the pixel data is not needed to count characters.

    :param basedir: str, directory that contains the image directory
    :param imagedirname: (Optional) str, image directory name including 'gt.mat'
    :param skip_missing: (Optional) bool, if True a missing image is logged and skipped,
                         otherwise FileNotFoundError is raised
    :return: None
    :raises FileNotFoundError: if an image is missing and skip_missing is False
    """

    class Symbols:
        """Accumulates the distinct symbol characters seen so far."""

        def __init__(self):
            self.symbols = set()

        def update(self, data):
            # Iterating a str yields its characters, so each symbol is added individually.
            self.symbols.update(data)

        def __len__(self):
            return len(self.symbols)

        def __str__(self):
            return ''.join(self.symbols)

    symbols = Symbols()
    # Compiled once; removing these runs leaves only the symbol characters of a word.
    alphanumeric = re.compile(r'[A-Za-z0-9]+')

    def csvgenerator(annodir, imagedir, cbb, wBB, imname, txts, symbols, **kwargs):
        image_num = kwargs.get('image_num')
        i = kwargs.get('i')

        imgpath = os.path.join(imagedir, imname)
        # Check existence before anything else so that skip_missing can take effect.
        if not os.path.exists(imgpath):
            if not skip_missing:
                raise FileNotFoundError('{} was not found'.format(imgpath))
            logging.warning('Missing image: {}'.format(imgpath))
            raise _Skip()

        # convert txts to list of str
        # The raw txts looks like
        # ['Lines:\nI lost\nKevin ', 'will ', 'line\nand ',
        #  'and\nthe ', '(and ', 'the\nout ',
        #  'you ', "don't\n pkg "]
        # i.e. it contains stray whitespace and its length differs from the number of
        # word boxes, so it is re-split on whitespace into individual words.
        txts = ' '.join(txts.tolist()).split()
        text_num = len(txts)

        if wBB.ndim == 2:
            # convert shape=(2, 4,) to (2, 4, 1)
            wBB = np.expand_dims(wBB, 2)
        if cbb.ndim == 2:
            # same single-box case for the character boxes
            cbb = np.expand_dims(cbb, 2)

        assert text_num == wBB.shape[2], 'The length of text and wordBB must be same, but got {} and {}'.format(
            text_num, wBB.shape[2])

        # Every character of every word (symbols included) must have one character box.
        char_num = sum(len(text) for text in txts)
        assert char_num == cbb.shape[2], 'The length of characters and cbb must be same, but got {} and {}'.format(
            char_num, cbb.shape[2])

        for text in txts:
            symbols.update(alphanumeric.sub('', text))

        sys.stdout.write('\r{}, and number is {}...{:0.1f}% ({}/{})'.format(symbols, len(symbols), 100 * (float(i + 1) / image_num), i + 1, image_num))

    _gtmatRecognizer(csvgenerator, basedir, imagedirname, customLog=True, symbols=symbols)

    print()
    print('symbols are {}, and number is {}'.format(symbols, len(symbols)))
def _gtmatRecognizer(generator, basedir, imagedirname='SynthText', customLog=False, **kwargs):
    """
    Load SynthText's gt.mat and call ``generator`` once per annotated image.

    Originally used to convert gt.mat to
    https://github.com/MhLiao/TextBoxes_plusplus/blob/master/data/example.xml

    :param generator: callable, invoked as
        generator(annodir, imagedir, cbb, wBB, imname, txts, i=i, image_num=image_num, **kwargs).
        It may raise _Skip to skip the current image.
    :param basedir: str, directory path under 'SynthText'(, 'licence.txt')
    :param imagedirname: (Optional) str, image directory name including 'gt.mat'
    :param customLog: (Optional) bool, if True the default progress line is not written
        (the generator is expected to report progress itself)
    :param kwargs: extra keyword arguments forwarded to generator
    :return: None
    :raises FileNotFoundError: if gt.mat does not exist in the image directory
    """
    imagedir = os.path.join(basedir, imagedirname)
    gtpath = os.path.join(imagedir, 'gt.mat')

    annodir = os.path.join(basedir, 'Annotations')

    if not os.path.exists(gtpath):
        raise FileNotFoundError('{} was not found'.format(gtpath))

    # create Annotations directory
    os.makedirs(annodir, exist_ok=True)

    # ref: http://www.robots.ox.ac.uk/~vgg/data/scenetext/readme.txt
    # gts = dict;
    #     __header__: bytes
    #     __version__: str
    #     __globals__: list
    #     charBB: object ndarray, shape = (1, image num).
    #             Character level bounding box. shape = (2=(x,y), 4=(top left,...: clockwise), BBox char num)
    #     wordBB: object ndarray, shape = (1, image num).
    #             Word level bounding box. shape = (2=(x,y), 4=(top left,...: clockwise), BBox word num)
    #     imnames: object ndarray, shape = (1, image num); each element wraps one file name
    #     txt: object ndarray, shape = (1, image num).
    #             Text. shape = (word num)
    logging.info('Loading {} now.\nIt may take a while.'.format(gtpath))
    gts = sio.loadmat(gtpath)

    charBB = gts['charBB'][0]
    wordBB = gts['wordBB'][0]
    imnames = gts['imnames'][0]
    texts = gts['txt'][0]

    image_num = imnames.size

    for i, (cbb, wBB, imname, txts) in enumerate(zip(charBB, wordBB, imnames, texts)):
        # each entry wraps the file name in a length-1 array
        imname = imname[0]

        try:
            generator(annodir, imagedir, cbb, wBB, imname, txts, i=i, image_num=image_num, **kwargs)
        except _Skip:
            # The generator asked to skip this image; keep going with the next one.
            pass

        if not customLog:
            sys.stdout.write('\rGenerating... {:0.1f}% ({}/{})'.format(100 * (float(i + 1) / image_num), i + 1, image_num))
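Finally, I got the number of symbols: 32. The character set appears to be the ASCII printable characters excluding the space, i.e. 52 letters (upper and lower case) + 10 digits + 32 symbols = 94 characters.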
INFO:root:Loading ~/data/text/SynthText/SynthText/gt.mat now.
It may take a while.
}&|%_(],$^{+?#@/-`).<=;~['>:\!"*, and number is 32...100.0% (858750/858750)
symbols are }&|%_(],$^{+?#@/-`).<=;~['>:\!"*, and number is 32