// ImageDownloader.java
package ac.essex.ooechs.imaging.commons.apps.training;
import ac.essex.ooechs.imaging.commons.ImageWindow;
import ac.essex.ooechs.imaging.commons.PixelLoader;
import ac.essex.ooechs.imaging.commons.Resizer;
import java.io.*;
import java.net.URL;
import java.net.URLConnection;
import java.util.LinkedHashSet;
import java.util.Set;
import java.util.Vector;
/**
 * Downloads a set of images using ASK Image search and places them in a folder.
 * Additional cropping options may be used.
 *
 * <p>There are two ways to use this class:
 * <ul>
 * <li>Keyword search: construct with a keyword and call {@link #start()}. The search
 * runs on this thread, one result page at a time; every image on a page is fetched
 * by its own {@link Downloader} thread and the search waits for all of them before
 * requesting the next page.</li>
 * <li>Directory download: construct with a URL and a filename prefix. All ".jpg" links
 * on that page are downloaded; the constructor starts the downloads itself.</li>
 * </ul>
 *
 * <p>All files are written to {@link #SAVE_TO}.
 *
 * @author Olly Oechsle, University of Essex, 22-Feb-2006
 * @version 1.0 Initial Version
 * @version 1.01 Cleaned up with further comments, 26-Oct-2006
 */
public class ImageDownloader extends Thread {

    /**
     * Directory that every downloaded image is saved into. Created on demand.
     */
    public static final String SAVE_TO = "/home/ooechs/Desktop/downloaded-test-images/";

    // different things to do with the image after downloading.
    // probably best to leave the original. I have implemented
    // a resizer using JAI (Java Advanced Imaging) but it isn't
    // as high quality as, say, ImageMagick.
    public static final int LEAVE_ORIGINAL = 0;
    public static final int RESIZE = 1;
    public static final int CROP = 2;

    /**
     * User-Agent header sent with page requests so that the server treats us as a browser.
     */
    private static final String USER_AGENT = "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1)";

    /**
     * Size passed to the resizer in RESIZE mode when no positive width was configured.
     */
    private static final int DEFAULT_RESIZE_WIDTH = 250;

    /**
     * The search gives up after this many consecutive result pages that yielded no
     * image links (or could not be fetched), instead of requesting pages forever.
     */
    private static final int MAX_EMPTY_PAGES = 3;

    public static void main(String args[]) {
        // NOTE: every downloader saves into SAVE_TO using keyword + counter + ".jpg" as the
        // filename, so the three "dog" searches below write to the same filenames.
        // Run them one at a time if you need the output of each.

        // To do an image search and get all the images
        new ImageDownloader("dog", 10, null).start();
        // As above, but crop images to 320x240
        new ImageDownloader("dog", 10, 320, 240, CROP, null).start();
        // As above, but resize images to 320x240
        new ImageDownloader("dog", 10, 320, 240, RESIZE, null).start();
        // Alternative usage: download all the images in a directory:
        // ( The following are a number of locations for getting faces.)
        // Download all images from this url and prefix all the image filenames with "a"
        new ImageDownloader("http://www.calvin.edu/news/photos/staff/", "a");
        //new ImageDownloader("http://news-info.wustl.edu/images/staff/", "b");
        //new ImageDownloader("http://www.recsports.vt.edu/staff/photos/", "c");
        //new ImageDownloader("http://ib.berkeley.edu/images/staff/", "d");
        //new ImageDownloader("http://astron.berkeley.edu/~blitz/Site%20Staff/", "e");
        //new ImageDownloader("http://www.benjamin-club.lu/images/staff/", "f");
        //new ImageDownloader("http://www.andrew.cmu.edu/org/hamerschlag/", "g"); //???
        //new ImageDownloader("http://www.swdc.wa.gov.au/files/staff/", "h");
        //new ImageDownloader("http://www.physics.unh.edu/people/studentphotos/", "j");
        //new ImageDownloader("http://www.vision.ee.ethz.ch/members/images/spuhler.jpg", "k");
    }

    /**
     * The keyword to search for. In directory-download mode this holds the
     * filename prefix instead.
     */
    protected String keyword;

    /**
     * Roughly how many images are required.
     */
    protected int imagesRequired;

    /**
     * If cropping or resizing - what width to use?
     */
    protected int width;

    /**
     * If cropping or resizing - what height to use?
     */
    protected int height;

    /**
     * Mode:
     * One of: LEAVE_ORIGINAL, CROP, or RESIZE
     */
    protected int mode;

    /**
     * The program to be notified as the images are downloaded. May be null.
     */
    ImageWaiter waiter;

    /**
     * How many attempts have been made to download images?
     * Also used to number the saved files.
     */
    private int attemptsCounter = 0;

    /**
     * How many images have been successfully downloaded?
     * We need to know this value so we know when to stop
     * downloading.
     */
    private int successCounter = 0;

    /**
     * Stores the number of active threads we have. We need to know
     * this so if all the threads die, the program won't wait indefinitely.
     */
    private int threads = 0;

    /**
     * Initialises the Image Downloader with settings but does not commence downloading.
     * Call the Thread's start() method to begin downloading; images are left as downloaded.
     *
     * @param keyword The keyword(s) to search for. Put a plus sign (+) between multiple words.
     * @param imagesRequired An indication of the number of images required.
     * @param waiter The program to be notified as new images are downloaded. A waiter is a class
     * that has implemented the ImageWaiter interface. You may pass null to this variable if you don't have
     * a waiter class.
     */
    public ImageDownloader(String keyword, int imagesRequired, ImageWaiter waiter) {
        this(keyword, imagesRequired, -1, -1, LEAVE_ORIGINAL, waiter);
    }

    /**
     * Initialises the Image Downloader with settings but does not commence downloading.
     * Call the Thread's start() method to begin downloading.
     *
     * @param keyword The keyword(s) to search for. Put a plus sign (+) between multiple words.
     * @param imagesRequired An indication of the number of images required.
     * @param width If the image is to be cropped or resized, it will be of this width.
     * @param height If the image is to be cropped, it will be of this height.
     * @param mode What to do with each image after download: LEAVE_ORIGINAL, RESIZE or CROP.
     * @param waiter The program to be notified as new images are downloaded. A waiter is a class
     * that has implemented the ImageWaiter interface. You may pass null to this variable if you don't have
     * a waiter class.
     */
    public ImageDownloader(String keyword, int imagesRequired, int width, int height, int mode, ImageWaiter waiter) {
        this.keyword = keyword;
        this.width = width;
        this.height = height;
        this.mode = mode;
        this.waiter = waiter;
        this.imagesRequired = imagesRequired;
    }

    /**
     * Downloads all the images from a specific URL. Useful if you want to extract
     * all the images from one of those directory listings pages.
     *
     * <p>Unlike the other constructors this one starts working immediately: it calls
     * {@link #downloadDirectory(String)}, which launches the download threads and
     * returns without waiting for them. Images are left as downloaded.
     *
     * @param url The page to scan for ".jpg" links.
     * @param filenameSuffix Text placed at the start of every saved filename, followed by
     * a running number and ".jpg" (despite the parameter name it is used as a prefix).
     */
    public ImageDownloader(String url, String filenameSuffix) {
        this.keyword = filenameSuffix;
        downloadDirectory(url);
    }

    /**
     * Performs the image search. This has been implemented in a thread-like manner so
     * a GUI can call it without pausing for a long time.
     *
     * <p>Result pages are requested one after another until enough images have been
     * downloaded successfully, the thread is interrupted, or {@link #MAX_EMPTY_PAGES}
     * consecutive pages produced no image links.
     */
    @Override
    public void run() {
        System.out.println("Using ASK Image Search...");
        int page = 1;
        int emptyPages = 0;
        while (getSuccessCounter() < imagesRequired && !Thread.currentThread().isInterrupted()) {
            System.out.println("Getting page: " + page);
            int found = ASKimageSearch("http://images.uk.ask.com/pictures?q=" + keyword + "&page=" + page);
            if (found > 0) {
                emptyPages = 0;
            } else {
                emptyPages++;
                if (emptyPages >= MAX_EMPTY_PAGES) {
                    // Without this the loop would never end once the results run out
                    // or while the network is unavailable.
                    System.out.println("!! No images found on " + emptyPages + " consecutive pages, giving up");
                    break;
                }
            }
            page++;
        }
        System.out.println("Got: " + getSuccessCounter() + " images successfully ( of " + attemptsCounter + " attempts )");
    }

    /**
     * Fetches one search result page, starts a {@link Downloader} for every distinct
     * image link on it and blocks until all of those downloaders have finished.
     *
     * @param s The URL of the result page.
     * @return The number of distinct image links found on the page; 0 if the page
     * could not be fetched or the wait was interrupted.
     */
    private synchronized int ASKimageSearch(String s) {
        try {
            String page = fetchPage(s);

            // Image URLs sit between these markers, still URL encoded.
            // A LinkedHashSet keeps page order and drops duplicates.
            final String startMarker = "imagesrc%3D";
            final String endMarker = "%26";
            Set<String> images = new LinkedHashSet<String>();
            int cursor = 0;
            while (true) {
                int start = page.indexOf(startMarker, cursor);
                if (start == -1) {
                    break;
                }
                int valueStart = start + startMarker.length();
                int end = page.indexOf(endMarker, valueStart);
                if (end == -1) {
                    break;
                }
                images.add(decode(page.substring(valueStart, end)));
                cursor = end + endMarker.length();
            }

            // now download the image files
            for (String image : images) {
                attemptsCounter++;
                new Downloader(this, image, keyword + attemptsCounter + ".jpg").start();
            }

            // Wait until all images on the page are downloaded. The downloaders cannot
            // deregister before wait() releases this monitor. Checking the counter in a
            // loop means a page with no images does not block at all and a spurious
            // wakeup does not end the wait early.
            System.out.println("Waiting, got " + getSuccessCounter() + " images");
            while (threads > 0) {
                wait();
            }
            return images.size();
        } catch (InterruptedException e) {
            System.err.println("!! Interrupted");
            // Keep the interrupt visible so that run() stops requesting pages.
            Thread.currentThread().interrupt();
            return 0;
        } catch (Exception e) {
            System.out.println("!! Exception caught: " + e.getMessage());
            e.printStackTrace();
            return 0;
        }
    }

    /**
     * Downloads all images referred to at a particular URL. Every distinct link ending
     * in ".jpg" (any case) gets its own {@link Downloader} thread; this method returns
     * as soon as the threads are started.
     *
     * @param url The directory to look at. One of those Apache "index of" pages usually.
     */
    public void downloadDirectory(String url) {
        try {
            String page = fetchPage(url);

            final String startMarker = "href=\"";
            final String endMarker = "\"";
            // Keeps page order and drops duplicate links.
            Set<String> images = new LinkedHashSet<String>();
            int cursor = 0;
            while (true) {
                int start = page.indexOf(startMarker, cursor);
                if (start == -1) {
                    break;
                }
                int valueStart = start + startMarker.length();
                int end = page.indexOf(endMarker, valueStart);
                if (end == -1) {
                    break;
                }
                String link = page.substring(valueStart, end);
                if (link.toLowerCase().endsWith(".jpg")) {
                    images.add(link);
                }
                cursor = end + 1;
            }

            // Links are resolved against the directory, so treat the URL as one.
            String directory = url.endsWith("/") ? url : url + "/";
            URL base = new URL(directory);
            System.out.println("Found " + images.size() + " images.");

            // now download the image files
            for (String link : images) {
                String imageUrl;
                try {
                    // Resolving (rather than concatenating) also copes with absolute
                    // and root-relative links.
                    imageUrl = new URL(base, link).toString();
                } catch (IOException e) {
                    System.err.println("Skipping malformed link: " + link);
                    continue;
                }
                attemptsCounter++;
                new Downloader(this, imageUrl, keyword + attemptsCounter + ".jpg").start();
                System.out.println(link);
            }
        } catch (Exception e) {
            System.out.println("!! Exception caught: " + e.getMessage());
            e.printStackTrace();
        }
    }

    /**
     * Reads the whole document at the given address into a string, pretending to be a
     * browser. Line terminators are dropped, i.e. the lines are simply concatenated.
     *
     * @param address The URL to fetch.
     * @return The document text without line terminators.
     * @throws IOException If the URL is malformed or the document cannot be read.
     */
    private static String fetchPage(String address) throws IOException {
        URLConnection connection = new URL(address).openConnection();
        connection.setRequestProperty("User-Agent", USER_AGENT);
        InputStream raw = connection.getInputStream();
        try {
            // ISO-8859-1 maps every byte to exactly one char; the links we extract are
            // plain ASCII, so no charset detection is needed.
            BufferedReader reader = new BufferedReader(new InputStreamReader(raw, "ISO-8859-1"));
            StringBuilder html = new StringBuilder(4096);
            String line;
            while ((line = reader.readLine()) != null) {
                html.append(line);
            }
            return html.toString();
        } finally {
            closeQuietly(raw);
        }
    }

    /**
     * Closes a resource, ignoring a null argument and any failure to close.
     * Only meant for resources that were read from.
     */
    private static void closeQuietly(Closeable resource) {
        if (resource == null) {
            return;
        }
        try {
            resource.close();
        } catch (IOException ignored) {
            // Nothing useful can be done about a failed close on a read-only stream.
        }
    }

    /**
     * Increments success counter. ImageDownloader uses multiple
     * download threads, so this needs to be accessed via a synchronised
     * method.
     */
    private synchronized void incrementSuccessCounter() {
        successCounter++;
    }

    /**
     * Returns the success counter. ImageDownloader uses multiple
     * download threads, so this needs to be accessed via a synchronised
     * method.
     */
    private synchronized int getSuccessCounter() {
        return successCounter;
    }

    /**
     * Increments the thread counter. ImageDownloader uses multiple
     * download threads, so this needs to be accessed via a synchronised
     * method.
     */
    private synchronized void registerThread() {
        threads++;
    }

    /**
     * Decrements the thread counter. Called when a thread has completed downloading
     * its image, whatever the outcome. When the last thread deregisters, the waiting
     * search is woken up; doing both under the same lock means the wake-up can
     * never be skipped.
     */
    private synchronized void deregisterThread() {
        threads--;
        if (threads < 1) {
            stopWaiting();
        }
    }

    /**
     * Returns how many threads are still downloading. The image search won't move onto
     * the next page of images until all threads have completed.
     */
    private synchronized int countThreads() {
        return threads;
    }

    /**
     * Wakes the main ImageDownloader thread, which pauses while the images of one page
     * are downloaded. Called when the last download thread has deregistered.
     */
    private synchronized void stopWaiting() {
        System.out.println("Stopped Waiting, got " + getSuccessCounter() + " images");
        notifyAll();
    }

    /**
     * Very simple method to turn the doubly URL-encoded slashes and colons of an
     * image link back into plain characters: "%252F" becomes "/" and "%253A"
     * becomes ":" (hex digits in either case). Everything else, including any
     * other '%' sequence, is copied through unchanged.
     *
     * @param string The encoded text.
     * @return The text with the two sequences above replaced.
     */
    private String decode(String string) {
        final String encodedSlash = "%252F";
        final String encodedColon = "%253A";
        int length = string.length();
        StringBuilder builder = new StringBuilder(length);
        int i = 0;
        while (i < length) {
            // regionMatches simply returns false when the region runs past the end,
            // so a '%' near the end of the string is safe.
            if (string.regionMatches(true, i, encodedSlash, 0, encodedSlash.length())) {
                builder.append('/');
                i += encodedSlash.length();
            } else if (string.regionMatches(true, i, encodedColon, 0, encodedColon.length())) {
                builder.append(':');
                i += encodedColon.length();
            } else {
                builder.append(string.charAt(i));
                i++;
            }
        }
        return builder.toString();
    }

    /**
     * Takes an URL, and downloads the image contained therein to a specific
     * filename inside {@link ImageDownloader#SAVE_TO}. The thread registers itself
     * with the enclosing ImageDownloader on construction and deregisters when it has
     * finished, successfully or not.
     */
    class Downloader extends Thread {

        String httpURL;

        // Kept for compatibility; all bookkeeping goes through the enclosing instance.
        ImageDownloader parent;

        String filename;

        /**
         * @param parent The ImageDownloader that created this thread.
         * @param httpURL The address of the image to download.
         * @param filename The name to save the image under, relative to SAVE_TO.
         */
        public Downloader(ImageDownloader parent, String httpURL, String filename) {
            registerThread();
            this.httpURL = httpURL;
            this.parent = parent;
            this.filename = filename;
        }

        /**
         * Downloads the image, applies the configured mode to it and, if a usable file
         * remains, counts a success and passes the file to the waiter. Links whose
         * filename ends in "gif" are skipped.
         */
        @Override
        public void run() {
            try {
                // ensure the save directory exists, otherwise create it.
                File directory = new File(SAVE_TO);
                if (!directory.exists()) {
                    System.out.println("Creating directory: " + SAVE_TO);
                    directory.mkdirs();
                }

                String imageFilename = httpURL.substring(httpURL.lastIndexOf('/') + 1).toLowerCase();
                if (imageFilename.endsWith("gif")) {
                    return;
                }

                System.out.println("Downloading: " + httpURL);
                File file = new File(directory, filename);
                download(new URL(httpURL), file);
                System.out.println("Saved: " + filename);

                if (!postProcess(file)) {
                    // The file was unusable and has been deleted: not a success.
                    return;
                }

                // indicate success
                incrementSuccessCounter();
                // give the image to the waiter.
                if (waiter != null) {
                    waiter.recieveImage(file);
                }
            } catch (FileNotFoundException fe) {
                System.err.println("File Not Found: " + httpURL);
            } catch (Exception e) {
                System.err.println("Exception caught: " + e.getMessage());
            } finally {
                // done - now de-activate myself. This also wakes the search if I am
                // the last one, on every exit path including the early returns.
                deregisterThread();
            }
        }

        /**
         * Copies the content at the given URL into the target file.
         */
        private void download(URL source, File target) throws IOException {
            InputStream in = source.openStream();
            try {
                OutputStream out = new FileOutputStream(target);
                try {
                    byte[] buffer = new byte[1024];
                    int len;
                    while ((len = in.read(buffer)) != -1) {
                        out.write(buffer, 0, len);
                    }
                } finally {
                    out.close();
                }
            } finally {
                closeQuietly(in);
            }
        }

        /**
         * Checks that the saved file can be loaded as an image and applies the
         * configured mode to it. Files that cannot be loaded, or that are too small
         * to crop, are deleted.
         *
         * @param file The freshly downloaded file.
         * @return true if a usable image file remains, false if it was deleted.
         * @throws Exception Whatever the imaging helpers throw; handled by run().
         */
        private boolean postProcess(File file) throws Exception {
            PixelLoader image = new PixelLoader(file);
            if (!image.loadedOK) {
                file.delete();
                return false;
            }

            switch (mode) {
                case RESIZE:
                    // Use the configured width; fall back to the historical default when
                    // none was given. NOTE(review): assumes the second argument of
                    // Resizer.resize is the target width - confirm against Resizer.
                    Resizer.resize(file, width > 0 ? width : DEFAULT_RESIZE_WIDTH);
                    return true;

                case CROP:
                    // crop the file to a given size.
                    // if the file is smaller than the minimum size then it is deleted.
                    if (!isLargerThanCrop(image)) {
                        file.delete();
                        return false;
                    }
                    if (image.getWidth() > (width * 3)) {
                        // shrink
                        Resizer.resize(file, width * 3);
                        // re-load
                        image = new PixelLoader(file);
                        // Shrinking a very wide image can leave it too short to crop;
                        // without this check the random offset below could be negative.
                        if (!image.loadedOK || !isLargerThanCrop(image)) {
                            file.delete();
                            return false;
                        }
                    }
                    int spareWidth = image.getWidth() - width;
                    int spareHeight = image.getHeight() - height;
                    int left = (int) (Math.random() * spareWidth);
                    int top = (int) (Math.random() * spareHeight);
                    SegmentedArea area = new SegmentedArea(left, top, width, height);
                    image = image.getSubImage(area);
                    image.saveAs(file);
                    return true;

                default:
                    // LEAVE_ORIGINAL (or an unknown mode): keep the file untouched.
                    return true;
            }
        }

        /**
         * Returns true if the image is strictly larger than the crop size in both dimensions.
         */
        private boolean isLargerThanCrop(PixelLoader image) {
            return image.getWidth() > width && image.getHeight() > height;
        }
    }
}