Created April 25, 2013 21:03
Detecting a Specific Watermark in a Photo with Python

Get example training and testing images here:
<http://bwbaugh.com/stack-overflow/16222178_watermark.tar>

Stack Overflow question:
<http://stackoverflow.com/questions/16222178/detecting-a-specific-watermark-in-a-photo-with-python-without-scipy>
# Copyright (C) 2013 Wesley Baugh
"""Tools for text classification.

Extracted from the [infer](https://github.com/bwbaugh/infer) library.
"""
from __future__ import division

import math
from collections import defaultdict, namedtuple, Counter
from fractions import Fraction


class MultinomialNB(object):
    """Multinomial Naive Bayes for text classification.

    Attributes:
        exact: Boolean indicating if exact probabilities should be
            returned as a `Fraction`. Otherwise, speed up computations
            but only return probabilities as a `float`. (default False)
        laplace: Smoothing parameter >= 0. (default 1)
        top_features: Number indicating the top-k most common features
            to use during classification, sorted by how frequently each
            feature has been seen (a count is kept for each label).
            This is a form of feature selection, because any feature
            with a frequency lower than that of the top-k most common
            features is ignored during classification. This value must
            be set before any training of the classifier. (default None)

    Properties:
        labels: Set of all class labels.
        vocabulary: Set of vocabulary across all class labels.
    """

    Prediction = namedtuple('Prediction', 'label confidence')

    def __init__(self, *documents):
        """Create a new Multinomial Naive Bayes classifier.

        Args:
            documents: Optional list of document-label pairs for training.
        """
        self.exact = False
        self.laplace = 1
        self.top_features = None
        # Dictionary of sets of vocabulary by label.
        self._label_vocab = defaultdict(set)
        # Dictionary of times a label has been seen.
        self._label_count = Counter()
        # Dictionary of the number of features seen in all documents,
        # by label.
        self._label_length = Counter()
        # Dictionary of times a feature has been seen, by label.
        self._label_feature_count = defaultdict(Counter)
        # Size of vocabulary across all class labels.
        self._vocab_size = 0
        if documents:
            self.train(*documents)

    @property
    def labels(self):
        """Set of all class labels.

        Returns:
            Example: set(['positive', 'negative'])
        """
        return set(label for label in self._label_count)

    @property
    def vocabulary(self):
        """Set of vocabulary (features) seen in any class label."""
        label_vocab = [self._label_vocab[x] for x in self._label_vocab]
        return set().union(*label_vocab)

    def train(self, *documents):
        """Train the classifier on one or more document-label pairs.

        Args:
            documents: Tuple of (document, label) pair(s). Each document
                must be a collection of features. The label can be any
                hashable object, though it is usually a string.
        """
        for document, label in documents:
            # Python 3: isinstance(document, str)
            if isinstance(document, basestring):
                raise TypeError('Documents must be a collection of features')
            self._label_count[label] += 1
            for feature in document:
                # Check if the feature hasn't been seen before for any label.
                if not any(feature in self._label_vocab[x] for x in self.labels):
                    self._vocab_size += 1
                self._label_vocab[label].add(feature)
                self._label_feature_count[label][feature] += 1
                self._label_length[label] += 1
                if self.top_features:
                    # `MostCommon` comes from the infer library and is
                    # not included in this gist.
                    if not hasattr(self, '_most_common'):
                        x = lambda: MostCommon(self.top_features)
                        self._most_common = defaultdict(x)
                    y = self._label_feature_count[label][feature]
                    self._most_common[label][feature] = y

    def prior(self, label):
        """Prior probability of a label.

        Args:
            label: The target class label.
            self.exact

        Returns:
            The number of training instances that had the target
            `label`, divided by the total number of training instances.
        """
        if label not in self.labels:
            raise KeyError(label)
        total = sum(self._label_count.values())
        if self.exact:
            return Fraction(self._label_count[label], total)
        else:
            return self._label_count[label] / total

    def conditional(self, feature, label):
        """Conditional probability of a feature given a label.

        Args:
            feature: The target feature.
            label: The target class label.
            self.laplace
            self.exact

        Returns:
            The number of times the feature has been present across all
            training documents for the `label`, divided by the sum of
            the length of every training document for the `label`.
        """
        # Note we use [Laplace smoothing][laplace].
        # [laplace]: https://en.wikipedia.org/wiki/Additive_smoothing
        if label not in self.labels:
            raise KeyError(label)
        # Times the feature was seen across all documents with this label.
        numer = self.laplace
        # Avoid creating an entry if the term has never been seen.
        if feature in self._label_feature_count[label]:
            numer += self._label_feature_count[label][feature]
        denom = self._label_length[label] + (self._vocab_size * self.laplace)
        if self.exact:
            return Fraction(numer, denom)
        else:
            return numer / denom
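
    # Worked example of the Laplace smoothing above (the numbers are
    # hypothetical): if 'cat' was seen 3 times across the 'positive'
    # documents, those documents contain 10 feature occurrences in
    # total, the overall vocabulary has 5 distinct features, and
    # laplace == 1, then conditional('cat', 'positive') returns
    # (3 + 1) / (10 + 5 * 1) == 4 / 15. An unseen feature still gets
    # probability mass: (0 + 1) / (10 + 5 * 1) == 1 / 15.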

    def _score(self, document, label):
        """Multinomial raw score of a document given a label.

        Args:
            document: Collection of features.
            label: The target class label.
            self.exact

        Returns:
            The multinomial raw score of the `document` given the
            `label`. In order to turn the raw score into a confidence
            value, this value should be divided by the sum of the raw
            scores across all class labels.
        """
        if isinstance(document, basestring):
            raise TypeError('Documents must be a collection of features')
        if self.exact:
            score = self.prior(label)
        else:
            score = math.log(self.prior(label))
        for feature in document:
            # Feature selection by only considering the top-k
            # most common features (a form of dictionary trimming).
            if self.top_features and feature not in self._most_common[label]:
                continue
            conditional = self.conditional(feature, label)
            if self.exact:
                score *= conditional
            else:
                score += math.log(conditional)
        return score

    def _compute_scores(self, document):
        """Compute the multinomial score of a document for all labels.

        Args:
            document: Collection of features.

        Returns:
            A dict mapping class labels to the multinomial raw score
            for the `document` given the label.
        """
        return {x: self._score(document, x) for x in self.labels}

    def prob_all(self, document):
        """Probability of each label given a document.

        Args:
            document: Collection of features.
            self.exact

        Returns:
            A dict mapping class labels to the confidence value that the
            `document` belongs to the label.
        """
        score = self._compute_scores(document)
        if not self.exact:
            # If the log-likelihood is too small, when we convert back
            # using `math.exp`, the result will round to zero. Shifting
            # every log-score by the maximum keeps the largest value at
            # exp(0) == 1 and leaves the normalized ratios unchanged.
            normalize = max(score.itervalues())
            assert normalize <= 0, normalize
            score = {x: math.exp(score[x] - normalize) for x in score}
        total = sum(score[x] for x in score)
        assert total > 0, (total, score)
        if self.exact:
            return {label: Fraction(score[label], total) for label in
                    self.labels}
        else:
            return {label: score[label] / total for label in self.labels}

    def prob(self, document, label):
        """Probability that a document belongs to a label.

        Args:
            document: Collection of features.
            label: The target class label.

        Returns:
            The confidence value that the `document` belongs to `label`.
        """
        return self.prob_all(document)[label]

    def classify(self, document):
        """Get the most confident class label for a document.

        Args:
            document: Collection of features.

        Returns:
            A namedtuple representing the most confident class `label`
            and the value of the `confidence` in the label. For example:

            As tuple:
                ('positive', 0.85)
            As namedtuple:
                Prediction(label='positive', confidence=0.85)
        """
        prob = self.prob_all(document)
        label = max(prob, key=prob.get)
        return self.Prediction(label, prob[label])
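
A minimal usage sketch of the classifier above (the toy documents and labels here are made-up examples, not part of the gist). Documents are passed as (features, label) pairs and must already be split into features:

from classify import MultinomialNB

# Train on (features, label) pairs; features are token lists.
classifier = MultinomialNB(
    ('buy cheap meds now'.split(), 'spam'),
    ('hello how are you'.split(), 'ham'),
)
# More pairs can be added incrementally.
classifier.train(('cheap meds for sale'.split(), 'spam'))
# `classify` returns a Prediction namedtuple.
prediction = classifier.classify('buy cheap now'.split())
print prediction.label, prediction.confidence

The second file in the gist, below, applies this classifier to the watermark-detection task.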
# Copyright (C) 2013 Wesley Baugh
"""Detect watermark in images.

### Requires
- [Pillow](https://pypi.python.org/pypi/Pillow/2.0.0)
"""
import glob

from classify import MultinomialNB
from PIL import Image

TRAINING_POSITIVE = 'training-positive/*.jpg'
TRAINING_NEGATIVE = 'training-negative/*.jpg'
TEST_POSITIVE = 'test-positive/*.jpg'
TEST_NEGATIVE = 'test-negative/*.jpg'
# How many pixels to grab from the top-right of the image.
CROP_WIDTH, CROP_HEIGHT = 100, 100
RESIZED = (16, 16)


def get_image_data(infile):
    image = Image.open(infile)
    width, height = image.size
    # Crop box order is (left, upper, right, lower).
    box = width - CROP_WIDTH, 0, width, CROP_HEIGHT
    region = image.crop(box)
    resized = region.resize(RESIZED)
    data = resized.getdata()
    # Convert each RGB pixel to a simple averaged intensity value
    # (integer division under Python 2).
    data = [sum(pixel) / 3 for pixel in data]
    # Combine location and value: repeat each pixel's location once
    # per unit of intensity, so the classifier sees counts.
    values = []
    for location, value in enumerate(data):
        values.extend([location] * value)
    return values
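
# Worked example of the encoding above (hypothetical values): a
# 2-pixel region with averaged intensities [3, 1] becomes the feature
# list [0, 0, 0, 1], i.e. location 0 repeated 3 times and location 1
# repeated once, so MultinomialNB treats pixel locations like words
# and intensities like word counts.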


def main():
    watermark = MultinomialNB()
    # Training
    count = 0
    for infile in glob.glob(TRAINING_POSITIVE):
        data = get_image_data(infile)
        watermark.train((data, 'positive'))
        count += 1
        print 'Training', count
    for infile in glob.glob(TRAINING_NEGATIVE):
        data = get_image_data(infile)
        watermark.train((data, 'negative'))
        count += 1
        print 'Training', count
    # Testing
    correct, total = 0, 0
    for infile in glob.glob(TEST_POSITIVE):
        data = get_image_data(infile)
        prediction = watermark.classify(data)
        if prediction.label == 'positive':
            correct += 1
        total += 1
        print 'Testing ({0} / {1})'.format(correct, total)
    for infile in glob.glob(TEST_NEGATIVE):
        data = get_image_data(infile)
        prediction = watermark.classify(data)
        if prediction.label == 'negative':
            correct += 1
        total += 1
        print 'Testing ({0} / {1})'.format(correct, total)
    print 'Got', correct, 'out of', total, 'correct'


if __name__ == '__main__':
    main()
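
As noted in the comments below, if your watermark sits somewhere other than the top-right corner, both CROP_WIDTH/CROP_HEIGHT and the crop coordinates in get_image_data have to match your own images. A sketch (the bottom-left placement is an assumed example, not part of the gist):

def get_bottom_left_box(image):
    # Pillow crop boxes are (left, upper, right, lower).
    width, height = image.size
    return 0, height - CROP_HEIGHT, CROP_WIDTH, height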
Looks interesting. Does it get the "specific watermark" from the training set? Because I can't see it being provided as input. Please confirm. Thanks :)
FYI, I see you specify Pillow 2.0, but if I'm dumb and ignore that and use Pillow 4.0, I only get 9 out of 10 matches using Python 2.7.5. Nothing a bit more training probably wouldn't solve, though! Nice work, btw!
In 2023, I used Python 3.9 with Pillow 4.0. Got 9/10 with the provided images as well. Added my personal images to the training positive/negative sets and the tests; got 16/21.
Edit: Solved it by looking at the code more thoroughly.
You have to use your own CROP_WIDTH and CROP_HEIGHT and also modify the crop coordinates to match your training images. Got 11/11 after that.
Python 2.7 is needed (the code uses `print` statements and `basestring`, both of which were removed in Python 3).