Compare commits

...

5 Commits

Author SHA1 Message Date
yinguobing 48de8cfd8e update model with new weights 2021-03-09 17:55:13 +08:00
yinguobing 0a6bac609e remove eager excution swwitch 2021-03-09 16:03:54 +08:00
yinguobing 1a3eaaf464 rename model directory name 2021-03-09 15:19:37 +08:00
yinguobing ecb46c7e14 add model file for face detector 2021-03-09 15:19:02 +08:00
yinguobing ec62053728 replace old opencv face detector with new one.
The new face detector is a EfficientDet implementation trained with
face images.
2021-03-09 15:04:28 +08:00
12 changed files with 254 additions and 1994 deletions

File diff suppressed because it is too large

Binary file not shown.

Binary file not shown.

Binary file not shown.


@@ -1,16 +1,23 @@
 """Demo code shows how to estimate human head pose.
-Currently, human face is detected by a detector from an OpenCV DNN module.
-Then the face box is modified a little to suits the need of landmark
-detection. The facial landmark detection is done by a custom Convolutional
-Neural Network trained with TensorFlow. After that, head pose is estimated
-by solving a PnP problem.
+
+There are three major steps in this code.
+
+Step 1: face detection. The human faces are detected by a deep learning face
+detector. Then the face boxes are modified a little to suit the needs of
+landmark detection.
+
+Step 2: facial landmark detection. This is done by a custom Convolutional
+Neural Network trained with TensorFlow.
+
+Step 3: head pose estimation. The pose is estimated by solving a PnP problem.
+
+All models and training code are available at: https://github.com/yinguobing/head-pose-estimation
 """
 from argparse import ArgumentParser
-from multiprocessing import Process, Queue
 import cv2
 import numpy as np
+import tensorflow as tf
+from face_detector import FaceDetector
 from mark_detector import MarkDetector
 from os_detector import detect_os
 from pose_estimator import PoseEstimator
@@ -18,12 +25,11 @@ from stabilizer import Stabilizer
 print("OpenCV version: {}".format(cv2.__version__))
 
-# multiprocessing may not work on Windows and macOS, check OS for safety.
-detect_os()
-
-CNN_INPUT_SIZE = 128
+devices = tf.config.list_physical_devices('GPU')
+for device in devices:
+    tf.config.experimental.set_memory_growth(device, True)
 
-# Take arguments from user input.
+# Parse arguments from user inputs.
 parser = ArgumentParser()
 parser.add_argument("--video", type=str, default=None,
                     help="Video file to be processed.")
@@ -32,41 +38,36 @@ parser.add_argument("--cam", type=int, default=None,
 args = parser.parse_args()
 
-def get_face(detector, img_queue, box_queue):
-    """Get face from image queue. This function is used for multiprocessing"""
-    while True:
-        image = img_queue.get()
-        box = detector.extract_cnn_facebox(image)
-        box_queue.put(box)
-
 
 def main():
-    """MAIN"""
-    # Video source from webcam or video file.
+    """Run human head pose estimation from video files."""
+    # The score threshold for face detection.
+    threshold = 0.5
+
+    # Set up the video source. If no video file is provided, the default
+    # webcam will be used.
    video_src = args.cam if args.cam is not None else args.video
    if video_src is None:
        print("Warning: video source not assigned, default webcam will be used.")
        video_src = 0
 
     cap = cv2.VideoCapture(video_src)
+
+    # If reading frames from a webcam, try setting the camera resolution.
     if video_src == 0:
         cap.set(cv2.CAP_PROP_FRAME_WIDTH, 640)
-    _, sample_frame = cap.read()
 
-    # Introduce mark_detector to detect landmarks.
-    mark_detector = MarkDetector()
+    # Get the real frame resolution.
+    width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
+    height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
 
-    # Setup process and queues for multiprocessing.
-    img_queue = Queue()
-    box_queue = Queue()
-    img_queue.put(sample_frame)
-    box_process = Process(target=get_face, args=(
-        mark_detector, img_queue, box_queue,))
-    box_process.start()
+    # Introduce a mark detector to detect facial marks.
+    detector_mark = MarkDetector("assets/mark_model")
 
-    # Introduce pose estimator to solve pose. Get one frame to setup the
-    # estimator according to the image size.
-    height, width = sample_frame.shape[:2]
+    # Introduce a face detector to detect human faces.
+    detector_face = FaceDetector("assets/face_model")
+
+    # Introduce pose estimator to solve pose.
     pose_estimator = PoseEstimator(img_size=(height, width))
 
     # Introduce scalar stabilizers for pose.
@@ -76,9 +77,13 @@ def main():
                                    cov_process=0.1,
                                    cov_measure=0.1) for _ in range(6)]
 
+    # Introduce a meter to measure the FPS.
     tm = cv2.TickMeter()
 
+    # Loop through the video frames.
     while True:
+        tm.start()
+
         # Read frame, crop it, flip it, to suit your needs.
         frame_got, frame = cap.read()
         if frame_got is False:
@@ -91,38 +96,40 @@ def main():
         if video_src == 0:
             frame = cv2.flip(frame, 2)
 
-        # Pose estimation by 3 steps:
-        # 1. detect face;
-        # 2. detect landmarks;
-        # 3. estimate pose
-
-        # Feed frame to image queue.
-        img_queue.put(frame)
+        # Preprocess the input image.
+        _image = detector_face.preprocess(frame)
 
-        # Get face from box queue.
-        facebox = box_queue.get()
+        # Run the face detection model.
+        boxes, scores, classes = detector_face.predict(_image, threshold)
 
-        if facebox is not None:
-            # Detect landmarks from image of 128x128.
-            face_img = frame[facebox[1]: facebox[3],
-                             facebox[0]: facebox[2]]
-            face_img = cv2.resize(face_img, (CNN_INPUT_SIZE, CNN_INPUT_SIZE))
-            face_img = cv2.cvtColor(face_img, cv2.COLOR_BGR2RGB)
+        # Transform the boxes into squares.
+        boxes = detector_face.transform_to_square(
+            boxes, scale=1.22, offset=(0, 0.13))
 
-            tm.start()
-            marks = mark_detector.detect_marks(face_img)
-            tm.stop()
+        # Clip the boxes if they cross the image boundaries.
+        boxes, _ = detector_face.clip_boxes(boxes, (0, 0, height, width))
+
+        # Detect facial marks.
+        if boxes.size > 0:
+            # Get one face image.
+            facebox = boxes[0]
+            top, left, bottom, right = [int(x) for x in facebox]
+            face_image = frame[top:bottom, left:right]
+
+            # Run detection.
+            face_image = detector_mark.preprocess(face_image)
+            marks = detector_mark.predict(face_image)
 
             # Convert the marks locations from local CNN to global image.
-            marks *= (facebox[2] - facebox[0])
-            marks[:, 0] += facebox[0]
-            marks[:, 1] += facebox[1]
-
-            # Uncomment following line to show raw marks.
-            # mark_detector.draw_marks(frame, marks, color=(0, 255, 0))
+            marks *= (right - left)
+            marks[:, 0] += left
+            marks[:, 1] += top
 
             # Uncomment following line to show facebox.
-            # mark_detector.draw_box(frame, [facebox])
+            # detector_face.draw_box(frame, facebox, scores[0])
+
+            # Draw the raw marks on the frame.
+            detector_mark.draw_marks(frame, marks, color=(0, 255, 0))
 
             # Try pose estimation with 68 points.
             pose = pose_estimator.solve_pose_by_68_points(marks)
@@ -141,20 +148,22 @@ def main():
             # Uncomment following line to draw stable pose annotation on frame.
             pose_estimator.draw_annotation_box(
                 frame, steady_pose[0], steady_pose[1], color=(128, 255, 128))
 
             # Uncomment following line to draw head axes on frame.
-            # pose_estimator.draw_axes(frame, steady_pose[0], steady_pose[1])
+            pose_estimator.draw_axes(frame, steady_pose[0], steady_pose[1])
+
+        tm.stop()
+
+        # Draw FPS on the screen's top left corner.
+        cv2.putText(frame, "FPS: {:.0f}".format(tm.getFPS()), (24, 24),
+                    cv2.FONT_HERSHEY_DUPLEX, 0.8, (255, 255, 255), 1, cv2.LINE_AA)
 
         # Show preview.
         cv2.imshow("Preview", frame)
-        if cv2.waitKey(10) == 27:
+        if cv2.waitKey(1) == 27:
             break
 
-    # Clean up the multiprocessing process.
-    box_process.terminate()
-    box_process.join()
-
 
 if __name__ == '__main__':
     main()
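
Taken together, the new demo loop reduces to the sketch below. This is a
paraphrase of the diff above rather than code from the commit: it assumes the
assets/face_model and assets/mark_model SavedModel directories added in this
change, and it leaves out the stabilizers and all drawing code.

# Minimal three-step pipeline: face detection -> landmark detection -> pose.
import cv2
from face_detector import FaceDetector
from mark_detector import MarkDetector
from pose_estimator import PoseEstimator

detector_face = FaceDetector("assets/face_model")
detector_mark = MarkDetector("assets/mark_model")

cap = cv2.VideoCapture(0)
width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
pose_estimator = PoseEstimator(img_size=(height, width))

while True:
    got, frame = cap.read()
    if not got:
        break

    # Step 1: face detection, with the boxes squared up and clipped.
    boxes, scores, _ = detector_face.predict(detector_face.preprocess(frame), 0.5)
    boxes = detector_face.transform_to_square(boxes, scale=1.22, offset=(0, 0.13))
    boxes, _ = detector_face.clip_boxes(boxes, (0, 0, height, width))

    if boxes.size > 0:
        top, left, bottom, right = [int(x) for x in boxes[0]]

        # Step 2: facial landmark detection on the cropped face.
        face = detector_mark.preprocess(frame[top:bottom, left:right])
        marks = detector_mark.predict(face)
        marks *= (right - left)
        marks[:, 0] += left
        marks[:, 1] += top

        # Step 3: head pose estimation from the 68 landmarks.
        pose = pose_estimator.solve_pose_by_68_points(marks)

Note that detection now runs in-process: the EfficientDet detector replaces
both the old OpenCV DNN face detector and the multiprocessing queue that fed
it, which is why the Process/Queue plumbing disappears from the diff.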

face_detector.py (new file, 161 lines)

@@ -0,0 +1,161 @@
"""Object detector based on the EfficientDet model.

This module supports running inference with the official EfficientDet model.
For more details: https://github.com/yinguobing/efficientdet-runner
"""
import cv2
import numpy as np
import tensorflow as tf


class FaceDetector(object):
    """Mini module to run the EfficientDet model."""

    def __init__(self, saved_model):
        """Build an EfficientDet model runner.

        Args:
            saved_model: the string path to the SavedModel.
        """
        self.scale_width = 0
        self.scale_height = 0
        self.input_size = 512

        # Load the SavedModel object.
        imported = tf.saved_model.load(saved_model)
        self._predict_fn = imported.signatures["serving_default"]

        # Keep a reference so the model is not garbage collected by Python,
        # see TensorFlow issue #37615.
        self._predict_fn._backref_to_saved_model = imported

    def preprocess(self, image):
        """Preprocess the input image."""
        # Scale the image first.
        height, width, _ = image.shape
        self.ratio = self.input_size / max(height, width)
        image = cv2.resize(
            image, (int(self.ratio * width), int(self.ratio * height)))

        # Convert to RGB.
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

        # Then pad the image to the input size.
        self.padding_h = self.input_size - int(self.ratio * width)
        self.padding_v = self.input_size - int(self.ratio * height)
        image = cv2.copyMakeBorder(
            image, 0, self.padding_v, 0, self.padding_h,
            cv2.BORDER_CONSTANT, value=(0, 0, 0))

        return image

    def __filter(self, detections, threshold):
        """Filter the detection results by score threshold."""
        # Get the detection results.
        boxes = detections['output_0'].numpy()[0]
        scores = detections['output_1'].numpy()[0]
        classes = detections['output_2'].numpy()[0]

        # Filter out the results by score threshold.
        mask = scores > threshold
        boxes = boxes[mask]
        scores = scores[mask]
        classes = classes[mask]

        return boxes, scores, classes

    @tf.function
    def _predict(self, images):
        return self._predict_fn(images)

    def predict(self, image, threshold):
        """Run inference with image inputs.

        Args:
            image: a numpy array as an input image.
            threshold: the score threshold for filtering detections.

        Returns:
            boxes, scores and classes of the filtered detections, with the
            boxes scaled back to the original image coordinates.
        """
        frame_tensor = tf.constant(image, dtype=tf.uint8)
        frame_tensor = tf.expand_dims(frame_tensor, axis=0)
        detections = self._predict(frame_tensor)
        boxes, scores, classes = self.__filter(detections, threshold)

        # Scale the boxes back to the original image size.
        boxes /= self.ratio

        # Crop out the padding area: clip ymax and xmax to the unpadded size.
        boxes[:, 2] = np.minimum(
            boxes[:, 2], (self.input_size - self.padding_v) / self.ratio)
        boxes[:, 3] = np.minimum(
            boxes[:, 3], (self.input_size - self.padding_h) / self.ratio)

        return boxes, scores, classes

    def transform_to_square(self, boxes, scale=1.0, offset=(0, 0)):
        """Get the square bounding boxes.

        Args:
            boxes: input boxes [[ymin, xmin, ymax, xmax], ...]
            scale: ratio to scale the boxes
            offset: a tuple of offset ratios to move the boxes (x, y)

        Returns:
            square boxes.
        """
        ymins, xmins, ymaxs, xmaxs = np.split(boxes, 4, 1)
        width = xmaxs - xmins
        height = ymaxs - ymins

        # How much to move.
        offset_x = offset[0] * width
        offset_y = offset[1] * height

        # Where the center location is.
        center_x = np.floor_divide(xmins + xmaxs, 2) + offset_x
        center_y = np.floor_divide(ymins + ymaxs, 2) + offset_y

        # Make them squares.
        margin = np.floor_divide(np.maximum(height, width) * scale, 2)
        boxes = np.concatenate((center_y - margin, center_x - margin,
                                center_y + margin, center_x + margin), axis=1)

        return boxes

    def clip_boxes(self, boxes, margins):
        """Clip the boxes to the safe margins.

        Args:
            boxes: input boxes [[ymin, xmin, ymax, xmax], ...].
            margins: a tuple of 4 ints (top, left, bottom, right) as safe margins.

        Returns:
            boxes: clipped boxes.
            clip_mark: the mark of clipped sides.
        """
        top, left, bottom, right = margins

        clip_mark = (boxes[:, 0] < top, boxes[:, 1] < left,
                     boxes[:, 2] > bottom, boxes[:, 3] > right)

        boxes[:, 0] = np.maximum(boxes[:, 0], top)
        boxes[:, 1] = np.maximum(boxes[:, 1], left)
        boxes[:, 2] = np.minimum(boxes[:, 2], bottom)
        boxes[:, 3] = np.minimum(boxes[:, 3], right)

        return boxes, clip_mark

    def draw_box(self, image, box, score, color=(0, 255, 0)):
        """Draw the bounding box.

        Args:
            image: the image to draw on.
            box: the face box.
            score: the detection score.
            color: the color of the box.
        """
        y0, x0, y1, x1 = [int(b) for b in box]
        cv2.rectangle(image, (x0, y0), (x1, y1), color, 2)
        cv2.putText(image, "Face:{:.2f}".format(score),
                    (x0, y0 - 7), cv2.FONT_HERSHEY_DUPLEX, 0.5, color,
                    1, cv2.LINE_AA)
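
As a quick numeric check of transform_to_square and clip_boxes above (a sketch
under the assumption that the face model assets from this change are available
to construct the detector; the box values are illustrative):

import numpy as np
from face_detector import FaceDetector

detector = FaceDetector("assets/face_model")  # needs the model assets

# One box in [ymin, xmin, ymax, xmax] order: 60 wide, 100 tall.
boxes = np.array([[0.0, 0.0, 100.0, 60.0]])
squared = detector.transform_to_square(boxes, scale=1.0, offset=(0, 0))
print(squared)  # [[  0. -20. 100.  80.]] -- a 100x100 square on the same center

# Clipping against a 480x640 image pulls xmin back inside the frame.
clipped, _ = detector.clip_boxes(squared, (0, 0, 480, 640))
print(clipped)  # [[  0.   0. 100.  80.]]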

mark_detector.py

@@ -5,163 +5,42 @@ import tensorflow as tf
 from tensorflow import keras
 
-class FaceDetector:
-    """Detect human face from image"""
-
-    def __init__(self,
-                 dnn_proto_text='assets/deploy.prototxt',
-                 dnn_model='assets/res10_300x300_ssd_iter_140000.caffemodel'):
-        """Initialization"""
-        self.face_net = cv2.dnn.readNetFromCaffe(dnn_proto_text, dnn_model)
-        self.detection_result = None
-
-    def get_faceboxes(self, image, threshold=0.5):
-        """Get the bounding box of faces in image using dnn."""
-        rows, cols, _ = image.shape
-
-        confidences = []
-        faceboxes = []
-
-        self.face_net.setInput(cv2.dnn.blobFromImage(
-            image, 1.0, (300, 300), (104.0, 177.0, 123.0), False, False))
-        detections = self.face_net.forward()
-
-        for result in detections[0, 0, :, :]:
-            confidence = result[2]
-            if confidence > threshold:
-                x_left_bottom = int(result[3] * cols)
-                y_left_bottom = int(result[4] * rows)
-                x_right_top = int(result[5] * cols)
-                y_right_top = int(result[6] * rows)
-                confidences.append(confidence)
-                faceboxes.append(
-                    [x_left_bottom, y_left_bottom, x_right_top, y_right_top])
-
-        self.detection_result = [faceboxes, confidences]
-
-        return confidences, faceboxes
-
-    def draw_all_result(self, image):
-        """Draw the detection result on image"""
-        for facebox, conf in self.detection_result:
-            cv2.rectangle(image, (facebox[0], facebox[1]),
-                          (facebox[2], facebox[3]), (0, 255, 0))
-            label = "face: %.4f" % conf
-            label_size, base_line = cv2.getTextSize(
-                label, cv2.FONT_HERSHEY_SIMPLEX, 0.5, 1)
-
-            cv2.rectangle(image, (facebox[0], facebox[1] - label_size[1]),
-                          (facebox[0] + label_size[0],
-                           facebox[1] + base_line),
-                          (0, 255, 0), cv2.FILLED)
-            cv2.putText(image, label, (facebox[0], facebox[1]),
-                        cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 0))
-
-
 class MarkDetector:
     """Facial landmark detector by Convolutional Neural Network"""
 
-    def __init__(self, saved_model='assets/pose_model'):
+    def __init__(self, saved_model):
         """Initialization"""
         # A face detector is required for mark detection.
-        self.face_detector = FaceDetector()
-
-        self.cnn_input_size = 128
-        self.marks = None
+        self.input_size = 128
 
-        # Restore model from the saved_model file.
-        self.model = keras.models.load_model(saved_model)
+        # Load the SavedModel object.
+        imported = tf.saved_model.load(saved_model)
+        self._predict_fn = imported.signatures["serving_default"]
+
+        # Keep a reference so the model is not garbage collected by Python,
+        # see TensorFlow issue #37615.
+        self._predict_fn._backref_to_saved_model = imported
 
-    @staticmethod
-    def draw_box(image, boxes, box_color=(255, 255, 255)):
-        """Draw square boxes on image"""
-        for box in boxes:
-            cv2.rectangle(image,
-                          (box[0], box[1]),
-                          (box[2], box[3]), box_color, 3)
+    def preprocess(self, image):
+        """Preprocess the input images."""
+        image = cv2.resize(image, (self.input_size, self.input_size))
+        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
+        return image
 
-    @staticmethod
-    def move_box(box, offset):
-        """Move the box in the direction specified by vector offset"""
-        left_x = box[0] + offset[0]
-        top_y = box[1] + offset[1]
-        right_x = box[2] + offset[0]
-        bottom_y = box[3] + offset[1]
-        return [left_x, top_y, right_x, bottom_y]
+    @tf.function
+    def _predict(self, images):
+        return self._predict_fn(image_input=images)
 
-    @staticmethod
-    def get_square_box(box):
-        """Get a square box out of the given box, by expanding it."""
-        left_x = box[0]
-        top_y = box[1]
-        right_x = box[2]
-        bottom_y = box[3]
-
-        box_width = right_x - left_x
-        box_height = bottom_y - top_y
-
-        # Check if box is already a square. If not, make it a square.
-        diff = box_height - box_width
-        delta = int(abs(diff) / 2)
-
-        if diff == 0:  # Already a square.
-            return box
-        elif diff > 0:  # Height > width, a slim box.
-            left_x -= delta
-            right_x += delta
-            if diff % 2 == 1:
-                right_x += 1
-        else:  # Width > height, a short box.
-            top_y -= delta
-            bottom_y += delta
-            if diff % 2 == 1:
-                bottom_y += 1
-
-        # Make sure box is always square.
-        assert ((right_x - left_x) == (bottom_y - top_y)), 'Box is not square.'
-
-        return [left_x, top_y, right_x, bottom_y]
-
-    @staticmethod
-    def box_in_image(box, image):
-        """Check if the box is in image"""
-        rows = image.shape[0]
-        cols = image.shape[1]
-        return box[0] >= 0 and box[1] >= 0 and box[2] <= cols and box[3] <= rows
-
-    def extract_cnn_facebox(self, image):
-        """Extract face area from image."""
-        _, raw_boxes = self.face_detector.get_faceboxes(
-            image=image, threshold=0.9)
-
-        for box in raw_boxes:
-            # Move box down.
-            offset_y = int(abs((box[3] - box[1]) * 0.12))
-            box_moved = self.move_box(box, [0, offset_y])
-
-            # Make box square.
-            facebox = self.get_square_box(box_moved)
-
-            if self.box_in_image(facebox, image):
-                return facebox
-
-        return None
-
-    def detect_marks(self, images):
+    def predict(self, image):
         """Detect marks from images"""
-        # # Actual detection.
-        marks = self.model.predict(tf.expand_dims(images, axis=0))
+        # Actual detection.
+        marks = self._predict(tf.convert_to_tensor([image], dtype=tf.float32))
 
         # Convert predictions to landmarks.
-        marks = np.reshape(marks, (-1, 2))
+        marks = np.reshape(marks['dense_1'].numpy(), (-1, 2))
 
         return marks
 
-    @staticmethod
-    def draw_marks(image, marks, color=(255, 255, 255)):
+    def draw_marks(self, image, marks, color=(255, 255, 255)):
         """Draw mark points on image"""
         for mark in marks:
             cv2.circle(image, (int(mark[0]), int(