Last active
December 26, 2024 17:25
-
-
Save smeschke/aa989df78551a9050a78e0d7a8c50495 to your computer and use it in GitHub Desktop.
Aligns a scanned document to find optimal rotation
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import cv2 | |
| import numpy as np | |
# Load the scan as a single-channel grayscale image (imread flag 0).
# cv2.imread returns None instead of raising when the file is missing or
# unreadable, so fail here with a clear message rather than with a TypeError
# on the subtraction below. The scan is read before the video writer is
# created so that a bad path does not leave an empty video file behind.
_scan = cv2.imread('/home/stephen/Desktop/scan.jpg', 0)
if _scan is None:
    raise FileNotFoundError('could not read /home/stephen/Desktop/scan.jpg')
# Invert the image; presumably the scan is dark text on light paper, so after
# inversion the text is bright and the page background is close to 0.
src = 255 - _scan
# Video that records every frame drawn during the rotation search
# (MJPG codec, 60 fps, 640x640 frames).
out = cv2.VideoWriter(
    '/home/stephen/Desktop/smooth_pose.avi',
    cv2.VideoWriter_fourcc('M', 'J', 'P', 'G'),
    60,
    (640, 640),
)
# Score of every angle tried so far, in the order they were tried.
scores = []
def rotate(img, angle):
    """Rotate ``img`` about its centre by ``angle`` degrees at scale 1.

    The output has the same width and height as the input, so content that
    is rotated outside the frame is cut off.

    Args:
        img: Image array whose first two dimensions are (rows, cols).
        angle: Rotation angle in degrees, passed to cv2.getRotationMatrix2D.

    Returns:
        The rotated image, same size as ``img``.
    """
    # Only the first two dimensions are spatial; slicing the shape lets
    # multi-channel (rows, cols, channels) images through as well as 2-D ones.
    rows, cols = img.shape[:2]
    M = cv2.getRotationMatrix2D((cols / 2, rows / 2), angle, 1)
    # warpAffine takes the output size as (width, height).
    return cv2.warpAffine(img, M, (cols, rows))
def sum_rows(img):
    """Return the per-row pixel sums of ``img``, scaled so the largest is 255.

    The last row of the image is not included, so the result has
    ``img.shape[0] - 1`` entries (this matches the length callers iterate
    over).

    Args:
        img: 2-D image array.

    Returns:
        A float64 array of row sums normalised to the range (0, 255).
        An image with fewer than two rows yields an empty array, and an
        image whose rows all sum to zero yields all zeros.
    """
    # Sum in float64: adding the uint8 pixels one by one in their own dtype
    # can wrap around at 256 and corrupt the profile.
    row_sums = np.asarray(img)[:-1].sum(axis=1, dtype=np.float64)
    if row_sums.size == 0:
        return row_sums
    peak = row_sums.max()
    if peak == 0:
        # Nothing to normalise; avoid 0/0 producing NaNs.
        return row_sums
    # Normalize range to (0,255)
    return (row_sums / peak) * 255
def display_data(roi, row_sums, buffer):
    """Draw the row-sum profile next to the ROI, show it and record the frame.

    Args:
        roi: 2-D grayscale region of interest.
        row_sums: Per-row values indexed by row number of ``roi``.
        buffer: Half the side length of the square canvas that is drawn.

    Returns:
        The key code returned by ``cv2.waitKey(1)``.
    """
    # Square canvas the visualisation is painted on.
    side = buffer * 2
    canvas = np.zeros((side, side), np.uint8)
    # Fill every canvas row (except the last ROI row) with that row's sum,
    # producing horizontal stripes.
    for y in range(roi.shape[0] - 1):
        canvas[y:y + 1, :] = row_sums[y]
    # Keep the stripes only left of column buffer/3; the rest shows the ROI.
    split = int(buffer / 3)
    canvas[:, split:] = roi[:, split:]
    cv2.imshow('bg1', canvas)
    key = cv2.waitKey(1)
    # The video writer takes 640x640 BGR frames.
    frame = cv2.cvtColor(cv2.resize(canvas, (640, 640)), cv2.COLOR_GRAY2BGR)
    out.write(frame)
    return key
# Rotate the image through a full circle in half-degree steps and keep the
# rotation with the lowest score.
angle = 0
while angle <= 360:
    # Rotate the source image
    img = rotate(src, angle)
    # Crop a centred square of side 2*buffer, where buffer is roughly one
    # third of the shorter image side (the roi is expected to be filled with text)
    h, w = img.shape
    buffer = min(h, w) - int(min(h, w) / 1.5)
    roi = img[int(h / 2 - buffer):int(h / 2 + buffer),
              int(w / 2 - buffer):int(w / 2 + buffer)]
    # Compute the sums of the rows
    row_sums = sum_rows(roi)
    # The score is the number of rows with a non-zero sum. The rotation with
    # the fewest such rows (the most completely empty rows) is kept.
    score = np.count_nonzero(row_sums)
    scores.append(score)
    # Image has the best (lowest) score seen so far
    if score <= min(scores):
        # Save the rotated image
        print('found optimal rotation')
        best_rotation = img.copy()
    k = display_data(roi, row_sums, buffer)
    # Key code 27 (Esc) aborts the search early
    if k == 27:
        break
    # Increment angle and try again
    angle += .5
# Release the writer so the recorded video is flushed and closed properly.
out.release()
cv2.destroyAllWindows()
def area_to_top_of_text(img):
    """Mark, column by column, the span from the top edge down to the peak pixel.

    For each column (except the last) the row of the first maximum value is
    found with ``np.argmax``; the pixels from row 0 down to that row are
    painted 123 on ``img`` and 255 on a blank mask of the same shape.
    The annotated image is shown in a window and the function blocks until
    a key is pressed.

    Args:
        img: 2-D grayscale image. It is modified in place, so pass a copy
            if the original must be preserved.

    Returns:
        A tuple ``(img, bg)``: the annotated input and the mask.
    """
    # Create a background to draw on
    bg = np.zeros_like(img)
    # Take the width from the argument itself instead of relying on a
    # module-level variable that happens to hold the same value.
    width = img.shape[1]
    # Iterate through the columns
    for position in range(width - 1):
        # Find the row of the (first) largest value in the column.
        # NOTE(review): argmax returns 0 for an all-zero column, so empty
        # columns contribute a single pixel — confirm this is intended.
        top = int(np.argmax(img[:, position]))
        # Fill in the area from the top of the page to top of the text
        a = position, 0
        b = position, top
        cv2.line(img, a, b, 123, 1)
        cv2.line(bg, a, b, 255, 1)
    # Show and return
    cv2.imshow('img', img)
    cv2.waitKey(0)
    return img, bg
# Find the area from the top of the page to the top of the text.
# The mask is uint8, so it must be summed with a wide accumulator: adding the
# rows together in their own dtype wraps around at 256 and gives a
# meaningless area.
_, bg = area_to_top_of_text(best_rotation.copy())
right_side_up = int(np.sum(bg, dtype=np.int64))
# Flip image and try again
best_rotation_flipped = rotate(best_rotation, 180)
_, bg = area_to_top_of_text(best_rotation_flipped.copy())
upside_down = int(np.sum(bg, dtype=np.int64))
# Keep the orientation whose marked area is smaller
if right_side_up < upside_down:
    aligned_image = best_rotation
else:
    aligned_image = best_rotation_flipped
# Save aligned image (inverted back, since the working images are 255 - scan)
cv2.imwrite('/home/stephen/Desktop/best_rotation.png', 255 - aligned_image)
cv2.destroyAllWindows()
Hey by any chance did you come up with the solution yet?
For those asking, here's a snippet of code that returns a positive number if there is more area in the margins above, and a negative number if there is more area in the margins below. (Disclaimer: this is untested code.)
def top_bot_margin_ratio(image: np.ndarray) -> float:
    """Compare the blank margin above the dark pixels with the margin below.

    For every column that contains at least one pixel darker than 128, the
    number of rows above the first dark pixel and below the last dark pixel
    is accumulated. Columns without dark pixels are ignored.

    Args:
        image: Grayscale image, or a multi-channel image that is converted
            with ``cv2.COLOR_BGR2GRAY`` first.

    Returns:
        ``log(above / below)``: positive when the total margin above is
        larger, negative when the margin below is larger. Returns ``0.0``
        when the two totals are equal (including an image with no dark
        pixels), ``inf`` when only the margin below is zero and ``-inf``
        when only the margin above is zero.
    """
    import math  # local import keeps this snippet self-contained

    if len(image.shape) > 2 and image.shape[2] > 1:
        image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    dark = np.asarray(image) < 128
    if dark.ndim == 3:
        # (rows, cols, 1) input: drop the singleton channel axis.
        dark = dark.reshape(dark.shape[0], dark.shape[1])
    has_dark = dark.any(axis=0)
    # argmax on a boolean column gives the index of its first True value;
    # on the vertically flipped mask it gives the distance of the last dark
    # pixel from the bottom edge.
    rows_above = dark.argmax(axis=0)
    rows_below = dark[::-1].argmax(axis=0)
    above = int(rows_above[has_dark].sum())
    below = int(rows_below[has_dark].sum())
    if above == below:
        return 0.0
    # Guard the zero cases, which would otherwise raise ZeroDivisionError
    # or a math domain error.
    if below == 0:
        return math.inf
    if above == 0:
        return -math.inf
    return math.log(above / below)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Hi, I came here from https://stackoverflow.com/questions/55654142/detect-if-a-text-image-is-upside-down. Can you provide some details about the function "area_to_top_of_text"? Thank you in advance.