Search code examples
datasettext-recognition

How many distinct characters does the SynthText in the Wild dataset contain?


I downloaded the SynthText in the Wild dataset from the official site.

I then read the official readme.txt, but I couldn't find how many distinct characters the dataset contains. I also searched online, but couldn't find the answer.

As you can see in the example image below, some symbols such as ., : and - exist. So this dataset contains letters (26, in upper and lower case) + digits (10) + some symbols (?).

enter image description here

Does anyone know it?


Solution

  • I wrote my own code to count the symbols.

    def get_characters(basedir, imagedirname='SynthText', skip_missing=False):
        """
        Collect every non-alphanumeric character that appears in the SynthText
        ground-truth texts and print the resulting set together with its size.

        The per-image work is done by the nested ``csvgenerator`` callback, which is
        driven by ``_gtmatRecognizer``.

        :param basedir: str, directory passed through to ``_gtmatRecognizer``
        :param imagedirname: (Optional) str, image directory name passed through to ``_gtmatRecognizer``
        :param skip_missing: (Optional) bool, if True a missing image is logged and skipped,
                             otherwise ``FileNotFoundError`` is raised
        :return: None (the result is printed to stdout)
        """

        class Symbols:
            """Accumulator for the distinct symbol characters seen so far."""

            def __init__(self):
                self.symbols = set()

            def update(self, data):
                # data is a str; every character of it is added to the set
                self.symbols.update(data)

            def __len__(self):
                return len(self.symbols)

            def __str__(self):
                return ''.join(self.symbols)

        symbols = Symbols()

        def csvgenerator(annodir, imagedir, cbb, wBB, imname, txts, symbols, **kwargs):
            """Per-image callback: validate the annotation and record its symbols."""
            image_num = kwargs.get('image_num')
            i = kwargs.get('i')

            imgpath = os.path.join(imagedir, imname)

            # Check for the file BEFORE anything touches it. The original decoded the
            # image with cv2.imread first, which returns None for a missing file and
            # crashed with AttributeError, so this branch could never run.
            # The decoded image itself was never used, so it is no longer read at all.
            if not os.path.exists(imgpath):
                if not skip_missing:
                    raise FileNotFoundError('{} was not found'.format(imgpath))
                logging.warning('Missing image: {}'.format(imgpath))
                raise _Skip()

            # convert txts to list of str
            # The raw entries look like
            # ['Lines:\nI lost\nKevin ', 'will                ', 'line\nand            ',
            # 'and\nthe             ', '(and                ', 'the\nout             ',
            # 'you                 ', "don't\n pkg          "]
            # i.e. they contain padding blanks and newlines, and the number of entries
            # differs from the number of word boxes, so re-split on whitespace.
            txts = ' '.join(txts.tolist()).split()
            text_num = len(txts)

            if wBB.ndim == 2:
                # convert shape=(2, 4,) to (2, 4, 1)
                wBB = np.expand_dims(wBB, 2)

            assert text_num == wBB.shape[2], 'The length of text and wordBB must be same, but got {} and {}'.format(
                text_num, wBB.shape[2])

            # replace non-alphanumeric characters with *
            # (only the total length is used, to cross-check against the character boxes)
            alltexts_asterisk = ''.join([re.sub(r'[^A-Za-z0-9]', '*', text) for text in txts])
            assert len(alltexts_asterisk) == cbb.shape[
                2], 'The length of characters and cbb must be same, but got {} and {}'.format(
                len(alltexts_asterisk), cbb.shape[2])

            for text in txts:
                # strip alphanumerics; whatever remains are the symbols of this word
                symbols.update(re.sub(r'[A-Za-z0-9]+', '', text))

            sys.stdout.write('\r{}, and number is {}...{:0.1f}% ({}/{})'.format(symbols, len(symbols), 100 * (float(i + 1) / image_num), i + 1, image_num))
            sys.stdout.flush()

        _gtmatRecognizer(csvgenerator, basedir, imagedirname, customLog=True, symbols=symbols)

        print()
        print('symbols are {}, and number is {}'.format(symbols, len(symbols)))
    
    
    def _gtmatRecognizer(generator, basedir, imagedirname='SynthText', customLog=False, **kwargs):
        """
        Load SynthText's gt.mat and call ``generator`` once for every image in it.

        ``generator`` is called as
        ``generator(annodir, imagedir, cbb, wBB, imname, txts, i=i, image_num=image_num, **kwargs)``.
        If it raises ``_Skip``, that image is silently skipped.

        The original use case was converting gt.mat into XML annotations like
        https://github.com/MhLiao/TextBoxes_plusplus/blob/master/data/example.xml
        but the conversion itself is up to ``generator``.

        :param generator: callable, per-image callback (see the call form above)
        :param basedir: str, directory containing ``imagedirname``; an 'Annotations' directory is created in it
        :param imagedirname: (Optional) str, name of the image directory that contains 'gt.mat'
        :param customLog: (Optional) bool, if True the built-in progress line is not printed
                          (the generator is expected to report progress itself)
        :param kwargs: extra keyword arguments forwarded unchanged to ``generator``
        :return: None
        :raises FileNotFoundError: if gt.mat does not exist under ``basedir/imagedirname``
        """
        logging.basicConfig(level=logging.INFO)

        imagedir = os.path.join(basedir, imagedirname)
        gtpath = os.path.join(imagedir, 'gt.mat')

        annodir = os.path.join(basedir, 'Annotations')

        if not os.path.exists(gtpath):
            raise FileNotFoundError('{} was not found'.format(gtpath))

        # create Annotations directory; exist_ok avoids the check-then-create race
        os.makedirs(annodir, exist_ok=True)

        # ref: http://www.robots.ox.ac.uk/~vgg/data/scenetext/readme.txt
        # gts = dict;
        #     __header__: bytes
        #     __version__: str
        #     __globals__: list
        #     charBB: object ndarray, shape = (1, image num).
        #             Character level bounding box. shape = (2=(x,y), 4=(top left,...: clockwise), BBox char num)
        #     wordBB: object ndarray, shape = (1, image num).
        #             Word level bounding box. shape = (2=(x,y), 4=(top left,...: clockwise), BBox word num)
        #     imnames: object ndarray, shape = (1, image num, 1).
        #     txt: object ndarray, presumably shape = (1, image num) -- TODO confirm.
        #          Text. shape = (word num)
        logging.info('Loading {} now.\nIt may take a while.'.format(gtpath))
        gts = sio.loadmat(gtpath)
        logging.info('Loaded\n')

        # drop the leading singleton axis so each array is indexed by image
        charBB = gts['charBB'][0]
        wordBB = gts['wordBB'][0]
        imnames = gts['imnames'][0]
        texts = gts['txt'][0]

        image_num = imnames.size

        for i, (cbb, wBB, imname, txts) in enumerate(zip(charBB, wordBB, imnames, texts)):
            # each entry is itself an array wrapping the file name
            imname = imname[0]

            try:
                generator(annodir, imagedir, cbb, wBB, imname, txts, i=i, image_num=image_num, **kwargs)
            except _Skip:
                # the generator asked to skip this image (e.g. the image file is missing)
                pass

            if not customLog:
                sys.stdout.write('\rGenerating... {:0.1f}% ({}/{})'.format(100 * (float(i + 1) / image_num), i + 1, image_num))
            sys.stdout.flush()

        print()
        logging.info('Finished!!!')
    

    Finally, I got the number of symbols: 32. Together with the alphanumeric characters, the character set appears to be the ASCII printable characters excluding space.

    INFO:root:Loading ~/data/text/SynthText/SynthText/gt.mat now.
    It may take a while.
    INFO:root:Loaded
    
    }&|%_(],$^{+?#@/-`).<=;~['>:\!"*, and number is 32...100.0% (858750/858750)
    INFO:root:Finished!!!
    
    symbols are }&|%_(],$^{+?#@/-`).<=;~['>:\!"*, and number is 32