I am having issues hosting a sign language detection application on DigitalOcean. It is supposed to use OpenCV to access the user's laptop webcam, send the frames to the model to predict the sign the user is making, and show the translation as text. It works locally, but when it is run on the server and accessed from a laptop, it displays a "cannot find camera by index" error. I have tried using other index values, but the default camera index on a laptop should be 0.
This is the application code:
from flask import Flask, render_template, Response
import cv2
from cvzone.ClassificationModule import Classifier
import sign_model
from cvzone.HandTrackingModule import HandDetector
# True while a browser-facing stream should keep reading frames from the camera.
camera_running = False
app = Flask(__name__)
# NOTE(review): VideoCapture(0) opens the webcam of the machine RUNNING this
# process (the server), not the visitor's browser/laptop. On a headless
# deployment (e.g. a DigitalOcean droplet) there is no device at index 0,
# which is why "can't find camera by index" appears only when deployed.
camera = cv2.VideoCapture(0)
@app.route("/home")
def home():
    """Serve the landing page."""
    page = 'home.html'
    return render_template(page)
@app.route("/choose-level")
def choose_level():
    """Serve the difficulty-selection page."""
    template_name = 'levels.html'
    return render_template(template_name)
@app.route("/text-to-asl")
def text_toASL():
    """Serve the text-to-ASL translation page."""
    page = 'text-to-asl.html'
    return render_template(page)
@app.route("/about")
def about():
    """Serve the about page."""
    template_name = 'about.html'
    return render_template(template_name)
@app.route("/level-one")
def level_one():
    """Serve the level-one (easy mode) game page."""
    page = 'level-one.html'
    return render_template(page)
@app.route("/level-two")
def level_two():
    """Serve the level-two (hard mode) game page."""
    template_name = 'level-two.html'
    return render_template(template_name)
def easy_mode():
    """Frame generator for the level-one (easy mode) MJPEG stream.

    Reads frames from the module-level camera while ``camera_running`` is
    set, runs the sign-language pipeline on each frame, and yields each
    annotated frame as a ``multipart/x-mixed-replace`` part. Releases the
    camera when the loop ends.

    Yields:
        bytes: one PNG-encoded multipart frame per iteration.
    """
    global camera_running
    global camera
    # Bug fix: `camera` is None after a previous stream released it (and may
    # have failed to open on a headless server); reading its properties would
    # raise AttributeError. Bail out cleanly instead.
    if camera is None or not camera.isOpened():
        camera_running = False
        return
    detector = HandDetector(maxHands=1, detectionCon=0.7)
    width = int(camera.get(cv2.CAP_PROP_FRAME_WIDTH))
    height = int(camera.get(cv2.CAP_PROP_FRAME_HEIGHT))
    # NOTE(review): the model is re-loaded on every stream request; hoisting
    # it to module scope would avoid the repeated (slow) load.
    classifier = Classifier("model/TESTWITH643_2.h5", "model/labels.txt")
    while camera_running:
        success, img = camera.read()
        if not success:
            break
        try:
            # Annotate the frame with the game overlay, then draw the hand
            # landmarks on top of it.
            imgOutput, _ = sign_model.easy_mode(width, height, img, classifier)
            detector.findHands(imgOutput, draw=True)
        except cv2.error:
            # Skip frames the pipeline cannot process (e.g. a bad hand crop).
            continue
        ret, buffer = cv2.imencode('.png', imgOutput)
        if ret:
            frame = buffer.tobytes()
            yield (b'--frame\r\n'
                   b'Content-Type: image/png\r\n\r\n' + frame + b'\r\n')
    # Release the camera once the loop is finished; start_camera_l1 reopens it.
    camera.release()
    camera = None
@app.route("/video_feed_l1")
def video_feed_l1():
    """Stream level-one (easy mode) frames as an MJPEG multipart response."""
    mimetype = 'multipart/x-mixed-replace; boundary=frame'
    return Response(easy_mode(), mimetype=mimetype)
@app.route("/start_camera_l1")
def start_camera_l1():
    """(Re)open the server-side webcam and enable the level-one stream.

    NOTE(review): cv2.VideoCapture(0) opens a camera attached to the machine
    running this process. On a headless host there is no device at index 0.

    Returns:
        str: a status message for the client.
    """
    global camera_running
    global camera
    if not camera_running:
        camera = cv2.VideoCapture(0)
        camera_running = True
        # Bug fix: previously this path also returned "Camera already
        # running" even though the camera had just been started.
        return "Camera started"
    return "Camera already running"
@app.route("/stop_camera_l1")
def stop_camera_l1():
    """Flag the level-one stream to stop; the generator releases the camera."""
    global camera_running, camera
    if not camera_running:
        return "Camera already stopped"
    camera_running = False
    return "Camera stopped"
def hard_mode():
    """Frame generator for the level-two (hard mode) MJPEG stream.

    Mirrors easy_mode() but runs sign_model.hard_mode on each frame.
    Reads frames while ``camera_running`` is set, yields each annotated
    frame as a ``multipart/x-mixed-replace`` part, and releases the camera
    when the loop ends.

    Yields:
        bytes: one PNG-encoded multipart frame per iteration.
    """
    global camera_running
    global camera
    # Bug fix: guard against a released/unopened camera (it is set to None
    # when a previous stream finishes); reading properties from None would
    # raise AttributeError.
    if camera is None or not camera.isOpened():
        camera_running = False
        return
    width = int(camera.get(cv2.CAP_PROP_FRAME_WIDTH))
    height = int(camera.get(cv2.CAP_PROP_FRAME_HEIGHT))
    detector = HandDetector(maxHands=1, detectionCon=0.7)
    # NOTE(review): the model is re-loaded on every stream request; hoisting
    # it to module scope would avoid the repeated (slow) load.
    classifier = Classifier("model/TESTWITH643_2.h5", "model/labels.txt")
    while camera_running:
        success, img = camera.read()
        if not success:
            break
        try:
            imgOutput, _ = sign_model.hard_mode(width, height, img, classifier)
            detector.findHands(imgOutput, draw=True)
        except cv2.error:
            # Skip frames the pipeline cannot process.
            continue
        ret, buffer = cv2.imencode('.png', imgOutput)
        if ret:
            # Use a separate name for the encoded bytes instead of reusing
            # imgOutput for two different types.
            frame = buffer.tobytes()
            yield (b'--frame\r\n'
                   b'Content-Type: image/png\r\n\r\n' + frame + b'\r\n')
    # Release the camera once the loop is finished; start_camera_l2 reopens it.
    camera.release()
    camera = None
@app.route('/video_feed_l2')
def video_feed_l2():
    """Stream level-two (hard mode) frames as an MJPEG multipart response."""
    mimetype = 'multipart/x-mixed-replace; boundary=frame'
    return Response(hard_mode(), mimetype=mimetype)
@app.route("/start_camera_l2")
def start_camera_l2():
    """(Re)open the server-side webcam and enable the level-two stream.

    NOTE(review): cv2.VideoCapture(0) opens a camera attached to the machine
    running this process. On a headless host there is no device at index 0.

    Returns:
        str: a status message for the client.
    """
    global camera_running
    global camera
    if not camera_running:
        camera = cv2.VideoCapture(0)
        camera_running = True
        # Bug fix: previously this path also returned "Camera already
        # running" even though the camera had just been started.
        return "Camera started"
    return "Camera already running"
@app.route("/stop_camera_l2")
def stop_camera_l2():
    """Flag the level-two stream to stop; the generator releases the camera."""
    global camera_running, camera
    if not camera_running:
        return "Camera already stopped"
    camera_running = False
    return "Camera stopped"
if __name__ == "__main__":
    # Listen on all interfaces so the app is reachable from other machines.
    # NOTE(review): debug=True enables the Werkzeug debugger/reloader and
    # must not be left on in production.
    app.run(host='0.0.0.0', debug=True)
This is the code that handles the processing of the frames before sending them to the model for prediction:
import cv2
import numpy as np
import math
from cvzone.HandTrackingModule import HandDetector
from random import random
from english_words import english_words_lower_alpha_set
import time
# Candidate practice words: sorted lowercase dictionary words of length 4-10
# that contain neither 'z' nor 'j' (per getLetter's label map, those two
# letters have no static sign the classifier can recognise).
words = [i for i in sorted(list(english_words_lower_alpha_set)) if ('z' not in i and 'j' not in i) and len(i) > 3 and len(i) <= 10]
start_time = time.time()  # reference point for the prediction throttle
curr_time = 0  # last throttle tick accepted (see easy_mode/hard_mode)
easy_word_user = ''  # letters the user has signed correctly so far
easy_word = words[int(random()*len(words))].upper()  # current target word
easy_word_index = 0  # index of the next letter the user must sign
#Main function
offset = 20  # padding (px) added around the detected hand bounding box
imgSize = 300  # side length of the square white canvas fed to the classifier
score = 0  # number of words the user has completed
# Index -> letter for the 24 static ASL fingerspelling signs. 'J' and 'Z'
# are absent (they require motion), so index 9 is 'K', 10 is 'L', etc.
_CLASS_LABELS = "ABCDEFGHIKLMNOPQRSTUVWXY"


def getLetter(result):
    """Map a classifier output index to its ASL letter.

    Args:
        result: the predicted class index; anything int()-convertible
            (typically a numeric string such as "5").

    Returns:
        str: the corresponding uppercase letter, or "error" when *result*
        is not an integer in the range 0-23 (matches the original contract,
        which returned "error" for any unparseable or out-of-range input).
    """
    # Bug fix: the original used a bare `except:` (catches SystemExit /
    # KeyboardInterrupt too) and rebuilt the 24-entry dict on every call.
    try:
        index = int(result)
    except (TypeError, ValueError):
        return "error"
    # Explicit bounds check: a plain string index would wrongly accept
    # negative values (e.g. -1 -> 'Y').
    if 0 <= index < len(_CLASS_LABELS):
        return _CLASS_LABELS[index]
    return "error"
def easy_mode(width, height, img, classifier):
    """Process one frame for the easy-mode spelling game.

    Draws the target word, the user's progress and the score onto a copy of
    *img*, crops the detected hand onto a 300x300 white canvas, classifies
    it, and advances the shared game state (easy_word_user, easy_word_index,
    score) when a confident prediction matches the next expected letter.

    Args:
        width: frame width in pixels (used to position overlays).
        height: frame height in pixels.
        img: camera frame (assumed BGR, as produced by cv2 capture —
            TODO confirm with the caller).
        classifier: cvzone Classifier returning per-letter probabilities.

    Returns:
        (imgOutput, score): the annotated frame and the running score.
    """
    global easy_word_user, easy_word, easy_word_index, curr_time, score
    # NOTE(review): a fresh HandDetector is constructed for every frame —
    # expensive; consider hoisting it to module scope.
    detector = HandDetector(maxHands=1)
    # Hint image for the next expected letter, scaled to 20%, pasted
    # top-right at the end of the function.
    letter_help = cv2.resize(cv2.imread('static/easy_mode_letters/{}.png'.format(easy_word[easy_word_index].lower())), (0,0), fx=0.2, fy=0.2)
    imgOutput = img.copy()
    try:
        # Target word in white, with the correctly-signed prefix drawn over
        # it in green at the same position; score in red at the top-left.
        cv2.putText(imgOutput, easy_word, (int(width*0.05), int(height*0.95)), cv2.FONT_HERSHEY_SIMPLEX, 2, (255, 255, 255), 2, cv2.LINE_4)
        cv2.putText(imgOutput, easy_word_user, (int(width*0.05), int(height*0.95)), cv2.FONT_HERSHEY_SIMPLEX, 2, (0, 255, 0), 2, cv2.LINE_4)
        cv2.putText(imgOutput, "User score: " + str(score), (50, 50), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 2, cv2.LINE_AA)
    except Exception as e:
        print(e)
    hands, img = detector.findHands(img, draw = True)
    if hands:
        hand = hands[0]
        x, y, w, h = hand['bbox']
        # Square white canvas matching the classifier's training input.
        imgWhite = np.ones((imgSize, imgSize, 3), np.uint8) * 255
        imgCrop = img[y - offset:y + h + offset, x - offset:x + w + offset]
        aspectRatio = h / w
        if aspectRatio > 1:
            # Tall hand: scale height to the canvas, centre horizontally.
            k = imgSize / h
            wCalc = math.ceil(k * w)
            imgResize = cv2.resize(imgCrop, (wCalc, imgSize))
            wGap = math.ceil((imgSize - wCalc) / 2)
            imgWhite[:, wGap:wCalc + wGap] = imgResize
            prediction, index = classifier.getPrediction(imgWhite, draw = True)
        else:
            # Wide hand: scale width to the canvas, centre vertically.
            k = imgSize / w
            hCalc = math.ceil(k * h)
            imgResize = cv2.resize(imgCrop, (imgSize, hCalc))
            hGap = math.ceil((imgSize - hCalc) / 2)
            imgWhite[hGap:hCalc + hGap, :] = imgResize
            prediction, index = classifier.getPrediction(imgWhite, draw = True)
        # Throttle game-state updates: only act when the 3-second tick
        # counter has advanced since the last accepted prediction.
        if curr_time < round((time.time() - start_time)/3,1):
            curr_time = round((time.time() - start_time)/3,1)
            try:
                max_index = np.argmax(prediction)
                max_value = prediction[max_index]
                print("Predicted:",getLetter(str(max_index)), ", pred prob:", max_value, ", current index:", easy_word_index, ", current time:", curr_time)
                # BUG(review): misplaced parenthesis — `str(max_index) in
                # ['A','T','S','N','X']` is evaluated first (a bool), so this
                # calls getLetter(True/False), which returns 'B'/'A' — always
                # truthy. The membership test therefore gates nothing.
                # Probably intended `getLetter(str(max_index)) in [...]` or a
                # similar rule — confirm the intent before fixing.
                if max_value >= 0.8 and getLetter(str(max_index) in ['A','T','S','N','X']):
                    pred_letter = getLetter(str(max_index))
                    # Accept the letter when it matches the next expected one
                    # and is not a repeat of the previous target letter.
                    if easy_word_index < len(easy_word) and pred_letter == easy_word[easy_word_index] and (easy_word_index == 0 or easy_word[easy_word_index] != easy_word[easy_word_index - 1]):
                        easy_word_user += pred_letter
                        easy_word_index += 1
                    # Separate branch for doubled letters (e.g. "LL").
                    # NOTE(review): if the first branch just fired and the
                    # following target letter is the same character, this can
                    # consume both in a single tick — presumably intentional
                    # for doubles; verify.
                    if easy_word_index < len(easy_word) and pred_letter == easy_word[easy_word_index] and easy_word_index > 0 and easy_word[easy_word_index] == easy_word[easy_word_index - 1] :
                        easy_word_user += pred_letter
                        easy_word_index += 1
                    if easy_word_user == easy_word:
                        # Word completed: brief pause, bump the score, then
                        # reset state with a fresh random target word.
                        time.sleep(0.5)
                        score += 1
                        easy_word = words[int(random()*len(words))].upper()
                        easy_word_index = 0
                        easy_word_user = ''
            except Exception as e:
                print(e)
    # Paste the hint image into the top-right corner of the output frame.
    imgOutput[5:5+letter_help.shape[0],width-5-letter_help.shape[1]:width-5] = letter_help
    return imgOutput, score
# ret, frame = cap.read()
def hard_mode(width, height, img, classifier):
    """Process one frame for the hard-mode spelling game.

    Near-duplicate of easy_mode(): same overlays, hand cropping,
    classification and game-state updates — the only difference is that the
    per-letter hint image is not shown. Shares the same module-level game
    state (easy_word_user, easy_word_index, score) with easy_mode.

    Args:
        width: frame width in pixels (used to position overlays).
        height: frame height in pixels.
        img: camera frame (assumed BGR — TODO confirm with the caller).
        classifier: cvzone Classifier returning per-letter probabilities.

    Returns:
        (imgOutput, score): the annotated frame and the running score.
    """
    global easy_word_user, easy_word, easy_word_index, curr_time, score
    # NOTE(review): a fresh HandDetector is constructed for every frame —
    # expensive; consider hoisting it to module scope.
    detector = HandDetector(maxHands=1)
    # letter_help = cv2.resize(cv2.imread('static/easy_mode_letters/{}.png'.format(easy_word[easy_word_index].lower())), (0,0), fx=0.2, fy=0.2)
    imgOutput = img.copy()
    try:
        # Target word in white, correctly-signed prefix in green on top,
        # score in red at the top-left.
        cv2.putText(imgOutput, easy_word, (int(width*0.05), int(height*0.95)), cv2.FONT_HERSHEY_SIMPLEX, 2, (255, 255, 255), 2, cv2.LINE_4)
        cv2.putText(imgOutput, easy_word_user, (int(width*0.05), int(height*0.95)), cv2.FONT_HERSHEY_SIMPLEX, 2, (0, 255, 0), 2, cv2.LINE_4)
        cv2.putText(imgOutput, "User score: " + str(score), (50, 50), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 2, cv2.LINE_AA)
    except Exception as e:
        print(e)
    hands, img = detector.findHands(img, draw = True)
    if hands:
        hand = hands[0]
        x, y, w, h = hand['bbox']
        # Square white canvas matching the classifier's training input.
        imgWhite = np.ones((imgSize, imgSize, 3), np.uint8) * 255
        imgCrop = img[y - offset:y + h + offset, x - offset:x + w + offset]
        aspectRatio = h / w
        if aspectRatio > 1:
            # Tall hand: scale height to the canvas, centre horizontally.
            k = imgSize / h
            wCalc = math.ceil(k * w)
            imgResize = cv2.resize(imgCrop, (wCalc, imgSize))
            wGap = math.ceil((imgSize - wCalc) / 2)
            imgWhite[:, wGap:wCalc + wGap] = imgResize
            prediction, index = classifier.getPrediction(imgWhite, draw = True)
        else:
            # Wide hand: scale width to the canvas, centre vertically.
            k = imgSize / w
            hCalc = math.ceil(k * h)
            imgResize = cv2.resize(imgCrop, (imgSize, hCalc))
            hGap = math.ceil((imgSize - hCalc) / 2)
            imgWhite[hGap:hCalc + hGap, :] = imgResize
            prediction, index = classifier.getPrediction(imgWhite, draw = True)
        # Throttle game-state updates to one per 3-second tick.
        if curr_time < round((time.time() - start_time)/3,1):
            curr_time = round((time.time() - start_time)/3,1)
            try:
                max_index = np.argmax(prediction)
                max_value = prediction[max_index]
                print("Predicted:",getLetter(str(max_index)), ", pred prob:", max_value, ", current index:", easy_word_index, ", current time:", curr_time)
                # BUG(review): same misplaced parenthesis as in easy_mode —
                # the membership test is evaluated first, so getLetter gets a
                # bool and the whole clause is always truthy. Confirm the
                # intended rule before fixing.
                if max_value >= 0.8 and getLetter(str(max_index) in ['A','T','S','N','X']):
                    pred_letter = getLetter(str(max_index))
                    # Accept the next expected letter (non-repeat case).
                    if easy_word_index < len(easy_word) and pred_letter == easy_word[easy_word_index] and (easy_word_index == 0 or easy_word[easy_word_index] != easy_word[easy_word_index - 1]):
                        easy_word_user += pred_letter
                        easy_word_index += 1
                    # Separate branch for doubled letters (e.g. "LL").
                    if easy_word_index < len(easy_word) and pred_letter == easy_word[easy_word_index] and easy_word_index > 0 and easy_word[easy_word_index] == easy_word[easy_word_index - 1] :
                        easy_word_user += pred_letter
                        easy_word_index += 1
                    if easy_word_user == easy_word:
                        # Word completed: pause briefly, bump the score, then
                        # reset state with a fresh random target word.
                        time.sleep(0.5)
                        score += 1
                        easy_word = words[int(random()*len(words))].upper()
                        easy_word_index = 0
                        easy_word_user = ''
            except Exception as e:
                print(e)
    # imgOutput[5:5+letter_help.shape[0],width-5-letter_help.shape[1]:width-5] = letter_help
    return imgOutput, score
I expected that the cv2.VideoCapture(0) function would automatically access the client's webcam and display the feed on the web application. What happened instead was that an error was displayed saying "can't find camera by index".
I expected that the cv2.VideoCapture(0) function would automatically access the client's webcam and display the feed on the web application.
This assumption is incorrect and this is what's causing the issue you're experiencing. It's true that if you run the Python program on a laptop, then the laptop hardware is being used for both server and client. That's why the server is able to access the laptop's webcam. When you deploy your code to a remote server, it is no longer laptop hardware being used to run the server side code. Therefore, it no longer has access to a webcam.
You will need to grow your architecture to account for the fact that the client will be using different hardware than the server. You will need to implement some client side application that is able to access the webcam and transmit the frames to the server, at which point the server will take the frames and perform its work. There will be many ways to implement this, but for example, you could have the client send HTTP requests to the server where the frame is contained in the HTTP request body. The server side code could extract the frame from the request body.
You may also need to use more advanced technologies like websockets if you need the server to be able to send messages to the client to inform it of some important event that occurs after two or more frames have been processed (like "a batch of frames was processed, we detected movement!").