Search code examples
android, android-camerax

Get all text inside box in a preview camerax android


I want to analyse only the text that is inside the box that I show on top of the camera preview, but I'm getting the wrong coordinates for the text. image

The validation is done with Rect.contains:

/**
 * Activity that shows a CameraX preview, runs ML Kit text recognition on every analysis frame and
 * prints each recognised element whose scaled top-left corner falls inside [rectCrop].
 */
class TestsPhotoscan : AppCompatActivity() {
private lateinit var binding: ActivityMainBinding
// View-size / image-size ratios, recomputed for every frame by setScaleFactor().
private var scaleX = 1F
private var scaleY = 1F
// Bounds of borderView in screen coordinates, captured after the first layout pass.
private var rectCrop = Rect()
private var cameraProvider: ProcessCameraProvider? = null
private lateinit var cameraProviderListenableFuture: ListenableFuture<ProcessCameraProvider>

/**
 * Inflates the layout, requests the camera provider, starts the camera (or asks for the camera
 * permission first) and records the on-screen bounds of the border view once it has been laid out.
 */
override fun onCreate(savedInstanceState: Bundle?) {
    super.onCreate(savedInstanceState)
    binding = ActivityMainBinding.inflate(layoutInflater)
    setContentView(binding.root)

    cameraProviderListenableFuture = ProcessCameraProvider.getInstance(this)

    // Request camera permissions
    if (allPermissionsGranted()) {
        startCamera()
    } else {
        ActivityCompat.requestPermissions(
            this,
            REQUIRED_PERMISSIONS,
            REQUEST_CODE_PERMISSIONS
        )
    }

    // The border view has no size until layout has run, so its bounds are read from a one-shot
    // global-layout listener that removes itself on the first callback.
    binding.borderView.viewTreeObserver.addOnGlobalLayoutListener(object :
        ViewTreeObserver.OnGlobalLayoutListener {
        override fun onGlobalLayout() {
            binding.borderView.viewTreeObserver.removeOnGlobalLayoutListener(this)
            val points = IntArray(2)
            // NOTE(review): these are screen coordinates, whereas the points tested against
            // rectCrop later are only scaled image coordinates; confirm both share an origin.
            binding.borderView.getLocationOnScreen(points)
            rectCrop = Rect(
                points[0],
                points[1],
                points[0] + binding.borderView.width,
                points[1] + binding.borderView.height
            )
        }
    })
}


/** Returns true when every permission in REQUIRED_PERMISSIONS has been granted. */
private fun allPermissionsGranted() = REQUIRED_PERMISSIONS.all {
    ContextCompat.checkSelfPermission(
        this, it
    ) == PackageManager.PERMISSION_GRANTED
}

/**
 * Waits for the camera provider future, stores the provider and then posts [setupCamera] to the
 * preview view's message queue.
 */
@SuppressLint("UnsafeExperimentalUsageError")
private fun startCamera() {

    cameraProviderListenableFuture.addListener(Runnable {
        cameraProvider = cameraProviderListenableFuture.get()
        binding.viewFinder.post { setupCamera() }
    }, ContextCompat.getMainExecutor(this))

}

/**
 * Builds the preview use case, targeting the display's rotation and pixel size, and connects it
 * to the preview view's surface provider.
 */
private fun buildPreviewUseCase(): Preview {
    val display = binding.viewFinder.display
    val metrics = DisplayMetrics().also { display.getMetrics(it) }
    val preview = Preview.Builder()
        .setTargetRotation(display.rotation)
        .setTargetResolution(Size(metrics.widthPixels, metrics.heightPixels))
        .build()
        .apply {
            setSurfaceProvider(binding.viewFinder.surfaceProvider)
        }

    return preview
}

/**
 * Creates the preview and image-analysis use cases and binds them to this activity's lifecycle
 * using the back camera. Binding failures are logged and otherwise ignored.
 */
private fun setupCamera() {

    cameraProviderListenableFuture.addListener({

        // Preview
        val preview = buildPreviewUseCase()

        // Every analysis frame is handed to processImage() on the main executor.
        val imageAnalyzer = ImageAnalysis.Builder()
            .build()
            .also {
                it.setAnalyzer(ContextCompat.getMainExecutor(this),
                    { processImage(it) })
            }

        // Select back camera as a default
        val cameraSelector = CameraSelector.DEFAULT_BACK_CAMERA

        val useCaseGroup = UseCaseGroup.Builder()
            .addUseCase(preview)
            .addUseCase(imageAnalyzer)
            .build()

        try {
            // Unbind use cases before rebinding
            cameraProvider?.unbindAll()

            // Bind use cases to camera
            cameraProvider?.bindToLifecycle(
                this, cameraSelector, useCaseGroup
            )

        } catch (exc: Exception) {
            Log.e(TAG, "Use case binding failed", exc)
        }

    }, ContextCompat.getMainExecutor(this))
}

/**
 * Updates the scale factors for this frame and runs text recognition on it. The frame is closed
 * when the recognition task completes.
 */
@SuppressLint("UnsafeOptInUsageError")
private fun processImage(imageProxy: ImageProxy) {

    setScaleFactor(imageProxy)
    recognizeText(
        InputImage. fromMediaImage(
            // `!!` throws if the proxy has no backing media image.
            imageProxy.image!!,
            imageProxy.imageInfo.rotationDegrees
        )
    ).addOnCompleteListener { imageProxy.close() }
}

/**
 * Stores the ratio between the preview view's size and the frame's size in [scaleX] / [scaleY].
 */
private fun setScaleFactor(imageProxy: ImageProxy) {
    val viewWidth = binding.viewFinder.width.toFloat()
    val viewHeight = binding.viewFinder.height.toFloat()
    val imageWidth = imageProxy.width.toFloat()
    // Left as an Int; dividing a Float by it below still produces a Float.
    val imageHeight = imageProxy.height

    // NOTE(review): the frame's width/height are used as-is, while recognition is given
    // rotationDegrees; for a rotated frame the two axes may need to be swapped here. Confirm.
    scaleX = viewWidth / imageWidth
    scaleY = viewHeight / imageHeight
}

/**
 * Runs ML Kit text recognition on [image] and prints every element whose scaled top-left corner
 * lies inside [rectCrop].
 *
 * @return the recognition task, so the caller can attach a completion listener.
 */
private fun recognizeText(image: InputImage): Task<Text> {

    // A recognizer client is requested on every call.
    val recognizer = TextRecognition.getClient(TextRecognizerOptions.DEFAULT_OPTIONS)

    return recognizer.process(image)
        .addOnSuccessListener(
            ScopedExecutor(TaskExecutors.MAIN_THREAD),
            OnSuccessListener<Text> {
                for (block in it.textBlocks) {
                    for (line in block.lines) {
                        for (element in line.elements) {
                            // Only the top-left corner is tested; an element without a bounding
                            // box falls back to -1 for both coordinates.
                            // NOTE(review): the translated point is merely image coordinates
                            // multiplied by a scale, while rectCrop is in screen coordinates;
                            // this looks like the source of the wrong hits. Confirm.
                            if (rectCrop.contains(
                                    translateX(element.boundingBox?.left ?: -1).roundToInt(),
                                    translateY(element.boundingBox?.top ?: -1).roundToInt()
                                )
                            ) {
                           
                            println(element.text)
                           }
                        }
                    }
                }
                
            })
}

/**
 * Starts the camera once the camera permission has been granted; otherwise shows a toast.
 */
override fun onRequestPermissionsResult(
    requestCode: Int,
    permissions: Array<out String>,
    grantResults: IntArray
) {
    super.onRequestPermissionsResult(requestCode, permissions, grantResults)
    if (requestCode == REQUEST_CODE_PERMISSIONS) {
        if (allPermissionsGranted()) {
            startCamera()
        } else {
            Toast.makeText(
                this,
                "Permissions not granted by the user.",
                Toast.LENGTH_SHORT
            ).show()
            // finish()
        }
        return
    }
}


companion object {
    // Log tag used when binding the camera use cases fails.
    private const val TAG = "Mytag"
    // Request code passed to requestPermissions() and matched in onRequestPermissionsResult().
    private const val REQUEST_CODE_PERMISSIONS = 10
    private val REQUIRED_PERMISSIONS = arrayOf(Manifest.permission.CAMERA)
}

/** Multiplies an image x coordinate by the current horizontal scale factor. */
fun translateX(x: Int) =
    x * scaleX

/** Multiplies an image y coordinate by the current vertical scale factor. */
fun translateY(y: Int) = y * scaleY

}

And the layout:

<?xml version="1.0" encoding="utf-8"?>
<!-- Data-binding layout: a full-screen camera preview with a bordered box drawn on top of it. -->
<layout xmlns:app="http://schemas.android.com/apk/res-auto">

    <androidx.constraintlayout.widget.ConstraintLayout
        xmlns:android="http://schemas.android.com/apk/res/android"
        android:id="@+id/root"
        android:layout_width="match_parent"
        android:layout_height="match_parent">

        <!-- Camera preview, constrained to all four edges of the parent. -->
        <androidx.camera.view.PreviewView
            android:id="@+id/viewFinder"
            android:layout_width="0dp"
            android:layout_height="0dp"
            app:layout_constraintTop_toTopOf="parent"
            app:layout_constraintBottom_toBottomOf="parent"
            app:layout_constraintStart_toStartOf="parent"
            app:layout_constraintEnd_toEndOf="parent" />

        <!-- 250dp-high box with a 16dp margin; text inside it is what should be reported. -->
        <View
            android:id="@+id/border_view"
            android:layout_width="match_parent"
            android:layout_height="250dp"
            android:layout_margin="16dp"
            android:background="@drawable/background_drawable"
            app:layout_constraintTop_toTopOf="parent"
            app:layout_constraintBottom_toBottomOf="@+id/viewFinder"
            app:layout_constraintStart_toStartOf="parent"
            app:layout_constraintEnd_toEndOf="parent" />

    </androidx.constraintlayout.widget.ConstraintLayout>
</layout>

Solution

  • The difficulty you are having is getting a good mapping from the image in the ImageProxy to what is displayed by the PreviewView. Although this sounds easy, I don't believe there is a straightforward way to do this mapping. See the answer to a similar question. I looked at implementing each of the suggestions in that answer and, although they worked in some situations, they failed in others. Of course, I could have taken the wrong approach.

    I have come to the conclusion that the simplest approach is to extract a bitmap from the preview area, analyze it, and identify the words that are completely enclosed by the red rectangle. I circumscribe those words with their own red rectangles to show that they have been correctly identified.

    enter image description here

    The following is the reworked activity, a graphic overlay that produces the word boxes, and the XML for the layout. Comments are in the code. Good luck!

    TestPhotoscan.kt

    /**
     * Camera screen that recognises text in the live preview and outlines every word that lies
     * completely inside the on-screen word fence.
     *
     * Recognition is run on the bitmap exposed by the preview view instead of the raw analysis
     * frame, which avoids having to map frame coordinates onto what the preview displays.
     */
    class TestsPhotoscan : AppCompatActivity() {
        private lateinit var binding: ActivityMainBinding

        // Scratch rectangle refilled with the word fence's hit rect on every recognition result.
        private val wordFenceRect = Rect()
        private var cameraProvider: ProcessCameraProvider? = null
        private lateinit var cameraProviderListenableFuture: ListenableFuture<ProcessCameraProvider>

        // One recognizer for the whole activity instead of a new client per frame. The delegate
        // is kept so onDestroy() can tell whether a client was ever created.
        private val recognizerDelegate = lazy {
            TextRecognition.getClient(TextRecognizerOptions.DEFAULT_OPTIONS)
        }
        private val recognizer by recognizerDelegate

        /**
         * Inflates the layout, requests the camera provider and either starts the camera or asks
         * for the camera permission.
         */
        override fun onCreate(savedInstanceState: Bundle?) {
            super.onCreate(savedInstanceState)
            binding = ActivityMainBinding.inflate(layoutInflater)
            setContentView(binding.root)

            cameraProviderListenableFuture = ProcessCameraProvider.getInstance(this)

            // Request camera permissions
            if (allPermissionsGranted()) {
                startCamera()
            } else {
                ActivityCompat.requestPermissions(
                    this,
                    REQUIRED_PERMISSIONS,
                    REQUEST_CODE_PERMISSIONS
                )
            }
        }

        /** Releases the text recognizer, but only if one was actually created. */
        override fun onDestroy() {
            if (recognizerDelegate.isInitialized()) {
                recognizer.close()
            }
            super.onDestroy()
        }

        /** Returns true when every permission in REQUIRED_PERMISSIONS has been granted. */
        private fun allPermissionsGranted() = REQUIRED_PERMISSIONS.all {
            ContextCompat.checkSelfPermission(
                this, it
            ) == PackageManager.PERMISSION_GRANTED
        }

        /**
         * Waits for the camera provider, stores it and then posts [setupCamera] to the preview
         * view's message queue.
         */
        @SuppressLint("UnsafeExperimentalUsageError")
        private fun startCamera() {
            cameraProviderListenableFuture.addListener({
                cameraProvider = cameraProviderListenableFuture.get()
                binding.viewFinder.post { setupCamera() }
            }, ContextCompat.getMainExecutor(this))
        }

        /**
         * Builds the preview use case, targeting the display's real pixel size and rotation, and
         * connects it to the preview view's surface provider.
         */
        private fun buildPreviewUseCase(): Preview {
            val display = binding.viewFinder.display
            val metrics = DisplayMetrics().also { display.getRealMetrics(it) }
            val rotation = display.rotation

            return Preview.Builder()
                .setTargetResolution(Size(metrics.widthPixels, metrics.heightPixels))
                .setTargetRotation(rotation)
                .build()
                .apply {
                    setSurfaceProvider(binding.viewFinder.surfaceProvider)
                }
        }

        /**
         * Creates the preview and image-analysis use cases and binds them to this activity's
         * lifecycle using the back camera. Binding failures are logged and otherwise ignored.
         *
         * Only called from [startCamera] after the provider future has completed, so the provider
         * is used directly rather than registering another listener on the same future.
         */
        @SuppressLint("UnsafeOptInUsageError")
        private fun setupCamera() {
            val provider = cameraProvider ?: return

            // Preview
            val preview = buildPreviewUseCase()

            // Frames are analysed on the main executor because processImage() reads view state.
            val imageAnalyzer = ImageAnalysis.Builder()
                .build()
                .also { analysis ->
                    analysis.setAnalyzer(ContextCompat.getMainExecutor(this)) { frame ->
                        processImage(frame)
                    }
                }

            // Select back camera as a default
            val cameraSelector = CameraSelector.DEFAULT_BACK_CAMERA

            val useCaseGroup = UseCaseGroup.Builder()
                .addUseCase(preview)
                .addUseCase(imageAnalyzer)
                .build()

            try {
                // Unbind use cases before rebinding
                provider.unbindAll()

                // Bind use cases to camera
                provider.bindToLifecycle(this, cameraSelector, useCaseGroup)
            } catch (exc: Exception) {
                Log.e(TAG, "Use case binding failed", exc)
            }
        }

        /**
         * Handles one analysis frame: optionally mirrors it into the inset view, then runs text
         * recognition on the preview's current bitmap.
         *
         * [imageProxy] is closed on every path. A frame that is never closed keeps the analyzer
         * from receiving further frames.
         */
        @SuppressLint("UnsafeOptInUsageError")
        private fun processImage(imageProxy: ImageProxy) {
            // This code will display the image available in the ImageProxy within an inset view
            // if the inset view is visible to the user.
            //
            // The source for ImageUtils is at
            // https://github.com/googlesamples/mlkit/blob/master/android/translate-showcase/app/src/main/java/com/google/mlkit/showcase/translate/util/ImageUtils.kt
            if (binding.insetView.visibility == View.VISIBLE) {
                // The backing media image is nullable; skip the inset instead of crashing.
                imageProxy.image?.let { mediaImage ->
                    val upright = rotateBitmap(
                        ImageUtils.convertYuv420888ImageToBitmap(mediaImage),
                        imageProxy.imageInfo.rotationDegrees.toFloat()
                    )
                    binding.insetView.setImageBitmap(upright)
                }
            }

            // PreviewViews allow access to a bitmap representation of what the preview shows. This is
            // just a whole lot easier than mapping the ImageProxy image to what the PreviewView
            // displays on the screen.  See https://stackoverflow.com/a/63912198/6287910
            val previewBitmap = binding.viewFinder.bitmap
            if (previewBitmap == null) {
                // Nothing to analyse yet (no bitmap available from the preview); the frame must
                // still be released or analysis stalls.
                imageProxy.close()
                return
            }

            recognizeText(InputImage.fromBitmap(previewBitmap, 0))
                .addOnCompleteListener { imageProxy.close() }
        }

        /**
         * Runs text recognition on [image] and replaces the word fence's boxes with the outlines
         * of all elements that lie entirely inside the fence. Failures are logged.
         *
         * @return the recognition task, so the caller can attach a completion listener.
         */
        private fun recognizeText(image: InputImage): Task<Text> =
            recognizer.process(image)
                .addOnSuccessListener(
                    ScopedExecutor(TaskExecutors.MAIN_THREAD),
                    { text ->
                        val fence = binding.wordFence
                        fence.clearBoxes()
                        fence.getHitRect(wordFenceRect)
                        for (block in text.textBlocks) {
                            for (line in block.lines) {
                                for (element in line.elements) {
                                    // For each word, check to make sure that the entire word is
                                    // contained within the word fence.
                                    if (isRectWithinRect(element.boundingBox, wordFenceRect)) {
                                        // Change the box boundary from the coordinate system of the
                                        // parent to the coordinates of the word fence.
                                        val outlineBox = Rect(element.boundingBox)
                                        outlineBox.offset(-fence.left, -fence.top)
                                        fence.addBox(outlineBox)
                                    }
                                }
                            }
                        }
                        fence.invalidate()
                    })
                .addOnFailureListener { exc ->
                    Log.e(TAG, "Text recognition failed", exc)
                }

        /**
         * Starts the camera once the camera permission has been granted; otherwise shows a toast.
         */
        override fun onRequestPermissionsResult(
            requestCode: Int,
            permissions: Array<out String>,
            grantResults: IntArray
        ) {
            super.onRequestPermissionsResult(requestCode, permissions, grantResults)
            if (requestCode == REQUEST_CODE_PERMISSIONS) {
                if (allPermissionsGranted()) {
                    startCamera()
                } else {
                    Toast.makeText(
                        this,
                        "Permissions not granted by the user.",
                        Toast.LENGTH_SHORT
                    ).show()
                    // finish()
                }
                return
            }
        }

        /** Returns true when [enclosedRect] is non-null and fully contained in [enclosingRect]. */
        private fun isRectWithinRect(enclosedRect: Rect?, enclosingRect: Rect) =
            enclosedRect != null && enclosingRect.contains(enclosedRect)

        /** Returns a copy of [bitmap] rotated by [rotation] degrees. */
        private fun rotateBitmap(bitmap: Bitmap, rotation: Float) =
            Matrix().run {
                preRotate(rotation)
                Bitmap.createBitmap(
                    bitmap, 0, 0, bitmap.width, bitmap.height, this, true
                )
            }

        companion object {
            private const val TAG = "Applog"
            private const val REQUEST_CODE_PERMISSIONS = 10
            private val REQUIRED_PERMISSIONS = arrayOf(Manifest.permission.CAMERA)
        }
    }
    

    BoxedWordView.kt

    /**
     * View that strokes an outline around every rectangle registered through [addBox].
     *
     * Rectangles are drawn exactly as given on this view's canvas, on top of the view's own
     * content. Neither [addBox] nor [clearBoxes] requests a redraw; the caller is expected to
     * invalidate the view after updating the boxes.
     */
    class BoxedWordView @JvmOverloads constructor(
        context: Context,
        attrs: AttributeSet? = null,
        defStyleAttr: Int = 0
    ) : View(context, attrs, defStyleAttr) {

        // Rectangles to outline on the next draw pass.
        private val outlines = mutableListOf<Rect>()

        // 2px stroke in the platform's holo_red_light colour.
        private val outlinePaint = Paint().also { paint ->
            paint.strokeWidth = 2f
            paint.color = context.resources.getColor(android.R.color.holo_red_light)
            paint.style = Paint.Style.STROKE
        }

        /** Draws the regular foreground first, then every registered outline over it. */
        override fun onDrawForeground(canvas: Canvas) {
            super.onDrawForeground(canvas)
            outlines.forEach { outline -> canvas.drawRect(outline, outlinePaint) }
        }

        /** Registers [box] to be outlined on subsequent draws. */
        fun addBox(box: Rect) {
            outlines += box
        }

        /** Forgets all registered boxes. */
        fun clearBoxes() {
            outlines.clear()
        }
    }
    

    activity_main.xml

    <!--
        Data-binding layout: full-screen camera preview, an optional debugging inset that can show
        the raw analysis frame, and the word fence that outlines the recognised words.
        The android/app/tools namespaces must be declared for the prefixed attributes below.
    -->
    <layout xmlns:android="http://schemas.android.com/apk/res/android"
        xmlns:app="http://schemas.android.com/apk/res-auto"
        xmlns:tools="http://schemas.android.com/tools">
    
        <androidx.constraintlayout.widget.ConstraintLayout
            android:id="@+id/root"
            android:layout_width="match_parent"
            android:layout_height="match_parent"
            android:background="@android:color/darker_gray">
    
            <!-- Camera preview, constrained to all four edges of the parent. -->
            <androidx.camera.view.PreviewView
                android:id="@+id/viewFinder"
                android:layout_width="0dp"
                android:layout_height="0dp"
                app:layout_constraintBottom_toBottomOf="parent"
                app:layout_constraintEnd_toEndOf="parent"
                app:layout_constraintStart_toStartOf="parent"
                app:layout_constraintTop_toTopOf="parent" />
    
            <!-- Debugging aid: make this visible to see the frame delivered to the analyzer. -->
            <ImageView
                android:id="@+id/insetView"
                android:layout_width="wrap_content"
                android:layout_height="wrap_content"
                android:visibility="invisible"
                app:layout_constraintBottom_toBottomOf="parent"
                app:layout_constraintStart_toStartOf="parent"
                tools:srcCompat="@tools:sample/backgrounds/scenic" />
    
            <!-- Word fence: only words completely inside this view are outlined. -->
            <com.example.textrecognition.BoxedWordView
                android:id="@+id/wordFence"
                android:layout_width="0dp"
                android:layout_height="250dp"
                android:layout_margin="16dp"
                android:background="@drawable/background_drawable"
                app:layout_constraintBottom_toBottomOf="@id/viewFinder"
                app:layout_constraintEnd_toEndOf="@id/viewFinder"
                app:layout_constraintStart_toStartOf="@id/viewFinder"
                app:layout_constraintTop_toTopOf="@id/viewFinder" />
        </androidx.constraintlayout.widget.ConstraintLayout>
    </layout>
    

    I will mention that the app sometimes freezes when coming back from the "recents" list. I might have introduced that problem, but be aware of it.