// ImageDownloader.java

package ac.essex.ooechs.imaging.commons.apps.training; 
 
import ac.essex.ooechs.imaging.commons.ImageWindow;
import ac.essex.ooechs.imaging.commons.PixelLoader;
import ac.essex.ooechs.imaging.commons.Resizer;

import java.io.*;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLConnection;
import java.util.LinkedHashSet;
import java.util.Set;
import java.util.Vector;
 
/**
 * Downloads a set of images using ASK Image search and places them in a folder.
 * Additional cropping options may be used.
 * <p>
 * Two modes of use are supported:
 * <ul>
 * <li><b>Keyword search</b>: construct with a keyword and call <code>start()</code>. Result
 * pages are fetched one at a time; every image linked from a page is downloaded by its own
 * <code>Downloader</code> thread and the next page is only requested once all of them have
 * finished.</li>
 * <li><b>Directory download</b>: the (url, prefix) constructor immediately starts a download
 * of every <code>.jpg</code> linked from a single page.</li>
 * </ul>
 * All files are written to {@link #SAVE_TO}.
 *
 * @author Olly Oechsle, University of Essex, 22-Feb-2006
 * @version 1.0 Initial Version
 * @version 1.01 Cleaned up with further comments, 26-Oct-2006
 */
public class ImageDownloader extends Thread {

    /**
     * Directory every downloaded image is written to. It is created on demand.
     */
    public static final String SAVE_TO = "/home/ooechs/Desktop/downloaded-test-images/";

    // different things to do with the image after downloading.
    // probably best to leave the original. I have implemented
    // a resizer using JAI (Java Advanced Imaging) but it isn't
    // as high quality as, say, ImageMagick.
    public static final int LEAVE_ORIGINAL = 0;
    public static final int RESIZE = 1;
    public static final int CROP = 2;

    /**
     * User-Agent header sent when fetching HTML pages, so that we look like a browser.
     */
    private static final String USER_AGENT = "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1)";

    /**
     * Connect and read timeout for every network request. Without a timeout a single
     * stalled server would keep a download thread - and therefore the search - blocked forever.
     */
    private static final int TIMEOUT_MILLIS = 30000;

    /**
     * Width used by RESIZE mode when no positive width was configured.
     */
    private static final int DEFAULT_RESIZE_WIDTH = 250;

    /**
     * Charset used to turn page bytes into characters. ISO-8859-1 maps every byte to exactly
     * one char, which is all the marker based link extraction needs.
     */
    private static final String PAGE_CHARSET = "ISO-8859-1";

    /**
     * The search gives up after this many consecutive result pages that produced no
     * download candidates (page could not be fetched, or contained no image links).
     */
    private static final int MAX_EMPTY_PAGES = 3;

    public static void main(String args[]) {

        // NOTE(review): the three searches below all use the keyword "dog" and therefore write
        // to the same file names (dog1.jpg, dog2.jpg, ...) in SAVE_TO while running concurrently.
        // They are usage examples; run them one at a time if the results matter.

        // To do an image search and get all the images
        new ImageDownloader("dog", 10, null).start();

        // As above, but crop images to 320x240
        new ImageDownloader("dog", 10, 320, 240, CROP, null).start();

        // As above, but resize images to 320x240
        new ImageDownloader("dog", 10, 320, 240, RESIZE, null).start();

        // Alternative usage: download all the images in a directory:
        // ( The following are a number of locations for getting faces.)

        // Download all images from this url and prefix all the image filenames with "a"
        new ImageDownloader("http://www.calvin.edu/news/photos/staff/", "a");

        //new ImageDownloader("http://news-info.wustl.edu/images/staff/", "b");
        //new ImageDownloader("http://www.recsports.vt.edu/staff/photos/", "c");
        //new ImageDownloader("http://ib.berkeley.edu/images/staff/", "d");
        //new ImageDownloader("http://astron.berkeley.edu/~blitz/Site%20Staff/", "e");
        //new ImageDownloader("http://www.benjamin-club.lu/images/staff/", "f");
        //new ImageDownloader("http://www.andrew.cmu.edu/org/hamerschlag/", "g"); //???
        //new ImageDownloader("http://www.swdc.wa.gov.au/files/staff/", "h");
        //new ImageDownloader("http://www.physics.unh.edu/people/studentphotos/", "j");
        //new ImageDownloader("http://www.vision.ee.ethz.ch/members/images/spuhler.jpg", "k");

    }

    /**
     * The keyword to search for. It is also used as the prefix of every saved file name.
     */
    protected String keyword;

    /**
     * Roughly how many images are required.
     */
    protected int imagesRequired;

    /**
     * If cropping or resizing - what width to use?
     */
    protected int width;

    /**
     * If cropping or resizing - what height to use?
     */
    protected int height;

    /**
     * Mode:
     * One of: LEAVE_ORIGINAL, CROP, or RESIZE
     */
    protected int mode;

    /**
     * The program to be notified as the images are downloaded. May be null.
     */
    ImageWaiter waiter;

    /**
     * How many attempts have been made to download images? The running value is also
     * used to give every saved file a distinct name.
     */
    private int attemptsCounter = 0;

    /**
     * How many images have been successfully downloaded (and kept)?
     * We need to know this value so we know when to stop downloading.
     */
    private int successCounter = 0;

    /**
     * Stores the number of active download threads we have. We need to know
     * this so the search knows when a page is finished and never waits for nothing.
     */
    private int threads = 0;

    /**
     * Initialises the Image Downloader with settings but does not commence downloading.
     * Call the Thread's start() method to begin the search in the background.
     *
     * @param keyword        The keyword(s) to search for. Put a plus sign (+) between multiple words.
     * @param imagesRequired An indication of the number of images required.
     * @param waiter         The program to be notified as new images are downloaded. A waiter is a class
     *                       that has implemented the ImageWaiter interface. You may pass null if you
     *                       don't have a waiter class.
     */
    public ImageDownloader(String keyword, int imagesRequired, ImageWaiter waiter) {
        this(keyword, imagesRequired, -1, -1, LEAVE_ORIGINAL, waiter);
    }

    /**
     * Initialises the Image Downloader with settings but does not commence downloading.
     * Call the Thread's start() method to begin the search in the background.
     *
     * @param keyword        The keyword(s) to search for. Put a plus sign (+) between multiple words.
     * @param imagesRequired An indication of the number of images required.
     * @param width          If the image is to be cropped or resized, it will be of this width.
     * @param height         If the image is to be cropped or resized, it will be of this height.
     * @param mode           What to do with each image: LEAVE_ORIGINAL, RESIZE or CROP.
     * @param waiter         The program to be notified as new images are downloaded. A waiter is a class
     *                       that has implemented the ImageWaiter interface. You may pass null if you
     *                       don't have a waiter class.
     */
    public ImageDownloader(String keyword, int imagesRequired, int width, int height, int mode, ImageWaiter waiter) {

        this.keyword = keyword;
        this.width = width;
        this.height = height;
        this.mode = mode;
        this.waiter = waiter;
        this.imagesRequired = imagesRequired;

    }

    /**
     * Downloads all the images from a specific URL. Useful if you want to extract
     * all the images from one of those directory listings pages.
     * <p>
     * Unlike the search constructors this one starts working immediately: the page is fetched
     * on the calling thread and one download thread per image is started before it returns.
     * Images are left as downloaded (mode LEAVE_ORIGINAL).
     *
     * @param url            The page whose linked .jpg files should be downloaded.
     * @param filenameSuffix Text placed at the <em>front</em> of every saved file name, followed
     *                       by a running number and ".jpg" (the parameter name is historical).
     */
    public ImageDownloader(String url, String filenameSuffix) {

        this.keyword = filenameSuffix;

        downloadDirectory(url);

    }

    /**
     * Performs the image search. This has been implemented in a thread-like manner so
     * a GUI can call it without pausing for a long time.
     * <p>
     * Result pages are requested one after another until enough images have been saved,
     * the thread is interrupted, or several consecutive pages yield nothing to download.
     */
    @Override
    public void run() {

        System.out.println("Using ASK Image Search...");

        int page = 1;
        int emptyPages = 0;

        while (getSuccessCounter() < imagesRequired && !Thread.currentThread().isInterrupted()) {

            System.out.println("Getting page: " + page);

            int attemptsBefore = getAttemptsCounter();
            ASKimageSearch("http://images.uk.ask.com/pictures?q=" + keyword + "&page=" + page);
            page++;

            // A page that produced no download attempts either failed to load or had no
            // image links. Tolerate a few of those, then stop instead of looping forever.
            if (getAttemptsCounter() == attemptsBefore) {
                emptyPages++;
                if (emptyPages >= MAX_EMPTY_PAGES) {
                    System.out.println("!! No images found on " + emptyPages + " consecutive pages, giving up");
                    break;
                }
            } else {
                emptyPages = 0;
            }
        }

        System.out.println("Got: " + getSuccessCounter() + " images successfully ( of " + getAttemptsCounter() + " attempts )");

    }

    /**
     * Fetches one result page, starts a download thread for every distinct image linked
     * from it and blocks until all of those threads have finished.
     * <p>
     * The method holds this object's monitor while it starts the threads; the download threads
     * need the same monitor to deregister, so none of them can report completion before this
     * method is actually waiting.
     *
     * @param s The URL of the result page.
     */
    private synchronized void ASKimageSearch(String s) {

        try {

            String page = fetchPage(s);

            // Image addresses sit between these two markers inside the result links.
            // A set is used so the same image is never queued twice for one page.
            Set<String> images = new LinkedHashSet<String>();
            for (String encoded : extractBetween(page, "imagesrc%3D", "%26")) {
                images.add(decode(encoded));
            }

            // now download the image files
            for (String image : images) {
                int attempt = nextAttemptNumber();
                new Downloader(this, image, keyword + attempt + ".jpg").start();
            }

            // wait until all images on the page are downloaded. The condition is re-checked in a
            // loop so that an empty page (nothing to wait for) and spurious or stale wake-ups
            // are both handled correctly.
            System.out.println("Waiting, got " + getSuccessCounter() + " images");
            while (countThreads() > 0) {
                wait();
            }

        } catch (InterruptedException e) {
            System.err.println("!! Interrupted");
            // keep the interrupt visible to run() so the search loop stops
            Thread.currentThread().interrupt();
        } catch (Exception e) {
            System.out.println("!! Exception caught: " + e.getMessage());
            e.printStackTrace();
        }
    }

    /**
     * Downloads all images referred to at a particular URL.
     * <p>
     * Every <code>href="..."</code> value ending in ".jpg" (case-insensitive) is resolved against
     * the page address, so relative, root-relative and absolute links all work. One download
     * thread is started per image; this method does not wait for them to finish.
     *
     * @param url The directory to look at. One of those Apache "index of" pages usually.
     */
    public void downloadDirectory(String url) {

        try {

            String page = fetchPage(url);

            // treat the address as a directory when resolving relative links
            if (!url.endsWith("/")) url += "/";
            URL base = new URL(url);

            Set<String> images = new LinkedHashSet<String>();
            for (String link : extractBetween(page, "href=\"", "\"")) {
                if (link.toLowerCase().endsWith(".jpg")) {
                    images.add(link);
                }
            }

            System.out.println("Found " + images.size() + " images.");

            // now download the image files
            for (String image : images) {

                String address;
                try {
                    address = new URL(base, image).toString();
                } catch (MalformedURLException e) {
                    System.err.println("!! Skipping malformed link: " + image);
                    continue;
                }

                int attempt = nextAttemptNumber();
                new Downloader(this, address, keyword + attempt + ".jpg").start();
                System.out.println(image);
            }

        } catch (Exception e) {
            System.out.println("!! Exception caught: " + e.getMessage());
            e.printStackTrace();
        }

    }

    /**
     * Downloads a page and returns its content as one string. Line terminators are
     * dropped, i.e. the lines are simply concatenated.
     *
     * @param address The URL of the page.
     * @return The page content without line breaks.
     * @throws IOException If the page cannot be fetched or read.
     */
    private static String fetchPage(String address) throws IOException {

        URLConnection connection = new URL(address).openConnection();

        // Pretend to be a browser
        connection.setRequestProperty("User-Agent", USER_AGENT);
        connection.setConnectTimeout(TIMEOUT_MILLIS);
        connection.setReadTimeout(TIMEOUT_MILLIS);

        BufferedReader reader = new BufferedReader(new InputStreamReader(connection.getInputStream(), PAGE_CHARSET));
        try {
            StringBuilder content = new StringBuilder(4096);
            String line;
            while ((line = reader.readLine()) != null) {
                content.append(line);
            }
            return content.toString();
        } finally {
            closeQuietly(reader);
        }
    }

    /**
     * Collects every piece of text that sits between a start marker and the next end marker.
     *
     * @param text        The text to scan.
     * @param startMarker Marker that precedes a value.
     * @param endMarker   Marker that terminates a value.
     * @return The distinct values, in order of first appearance. Empty if there are none.
     */
    private static Set<String> extractBetween(String text, String startMarker, String endMarker) {

        Set<String> values = new LinkedHashSet<String>();

        int start = text.indexOf(startMarker);
        while (start != -1) {
            int valueStart = start + startMarker.length();
            int valueEnd = text.indexOf(endMarker, valueStart);
            if (valueEnd == -1) break; // unterminated value: nothing more to extract
            values.add(text.substring(valueStart, valueEnd));
            start = text.indexOf(startMarker, valueEnd + endMarker.length());
        }

        return values;
    }

    /**
     * Closes a resource, ignoring any failure to do so. Accepts null.
     */
    private static void closeQuietly(Closeable resource) {
        if (resource == null) return;
        try {
            resource.close();
        } catch (IOException ignored) {
            // nothing useful can be done about a failed close
        }
    }

    /**
     * Hands out the next attempt number. Synchronised because the number is part of the file
     * name and must be unique per download.
     */
    private synchronized int nextAttemptNumber() {
        return ++attemptsCounter;
    }

    /**
     * Returns how many download attempts have been started so far.
     */
    private synchronized int getAttemptsCounter() {
        return attemptsCounter;
    }

    /**
     * Increments success counter. ImageDownloader uses multiple
     * download threads, so this needs to be accessed via a synchronised
     * method.
     */
    private synchronized void incrementSuccessCounter() {
        successCounter++;
    }

    /**
     * Returns the success counter. ImageDownloader uses multiple
     * download threads, so this needs to be accessed via a synchronised
     * method.
     */
    private synchronized int getSuccessCounter() {
        return successCounter;
    }

    /**
     * Increments the thread counter. ImageDownloader uses multiple
     * download threads, so this needs to be accessed via a synchronised
     * method.
     */
    private synchronized void registerThread() {
        threads++;
    }

    /**
     * Decrements the thread counter. Called when a thread has finished with its image.
     * The decrement and the "was I the last one?" check happen under the same lock, so
     * exactly the thread that brings the count to zero wakes the waiting search.
     */
    private synchronized void deregisterThread() {
        threads--;
        if (threads < 1) {
            stopWaiting();
        }
    }

    /**
     * Returns how many threads are still downloading. The image search won't move onto
     * the next page of images until all threads have completed.
     */
    private synchronized int countThreads() {
        return threads;
    }

    /**
     * Wakes the search thread that is waiting for the current page's downloads to finish.
     * Called when the last download thread deregisters.
     */
    private synchronized void stopWaiting() {
        System.out.println("Stopped Waiting, got " + getSuccessCounter() + " images");
        notifyAll();
    }

    /**
     * Very simple method to undo the double URL encoding of the two characters that matter
     * in an image address: "%252F" becomes "/" and "%253A" becomes ":" (hex digits are matched
     * case-insensitively). Everything else - including any other '%' sequence and an
     * incomplete sequence at the end of the string - is copied through unchanged.
     *
     * @param string The encoded text.
     * @return The text with the two sequences above replaced.
     */
    private String decode(String string) {

        StringBuilder builder = new StringBuilder(string.length());

        int length = string.length();
        int i = 0;

        while (i < length) {

            char c = string.charAt(i);

            // startsWith / regionMatches simply answer false near the end of the string,
            // so no index can run past the end.
            if (c == '%' && string.startsWith("25", i + 1)) {
                if (string.regionMatches(true, i + 3, "2F", 0, 2)) {
                    builder.append('/');
                    i += 5;
                    continue;
                }
                if (string.regionMatches(true, i + 3, "3A", 0, 2)) {
                    builder.append(':');
                    i += 5;
                    continue;
                }
            }

            builder.append(c);
            i++;
        }

        return builder.toString();

    }

    /**
     * Takes an URL, and downloads the image contained therein to a specific
     * filename inside {@link ImageDownloader#SAVE_TO}. Afterwards the image is checked and,
     * depending on the mode of the enclosing ImageDownloader, resized or cropped.
     * <p>
     * Creating a Downloader registers it with the enclosing ImageDownloader; when run()
     * finishes - successfully or not - it deregisters itself again.
     */
    class Downloader extends Thread {

        String httpURL;
        ImageDownloader parent;
        String filename;

        /**
         * @param parent   The ImageDownloader this download belongs to.
         * @param httpURL  Address of the image.
         * @param filename Name of the file to create inside SAVE_TO.
         */
        public Downloader(ImageDownloader parent, String httpURL, String filename) {
            registerThread();
            this.httpURL = httpURL;
            this.parent = parent;
            this.filename = filename;
        }

        /**
         * Downloads and post-processes the image. Only an image that is still on disk at the
         * end counts as a success and is handed to the waiter; anything else is removed.
         */
        @Override
        public void run() {

            File file = null;
            boolean kept = false;

            try {

                // ensure the save directory exists, otherwise create it.
                File directory = new File(SAVE_TO);
                if (!directory.exists()) {
                    System.out.println("Creating directory: " + SAVE_TO);
                    directory.mkdirs();
                }

                // GIFs are not wanted
                String imageFilename = httpURL.substring(httpURL.lastIndexOf('/') + 1).toLowerCase();
                if (imageFilename.endsWith("gif")) return;

                System.out.println("Downloading: " + httpURL);

                file = new File(directory, filename);
                download(httpURL, file);

                System.out.println("Saved: " + filename);

                kept = postProcess(file);

                if (kept) {
                    // indicate success
                    incrementSuccessCounter();

                    // give the image to the waiter.
                    if (waiter != null) waiter.recieveImage(file);
                }

            } catch (FileNotFoundException fe) {
                System.err.println("File Not Found: " + httpURL);
            } catch (Exception e) {
                System.err.println("Exception caught: " + e.getMessage());
            } finally {
                try {
                    // don't leave partial or rejected files behind
                    if (!kept && file != null) file.delete();
                } finally {
                    // done - now de-activate myself. This must always happen, otherwise
                    // the search would wait for this thread forever.
                    deregisterThread();
                }
            }

        }

        /**
         * Copies the content found at an address into a file.
         *
         * @param address The URL to read from.
         * @param target  The file to write to. It is only created once the connection succeeded.
         * @throws IOException If connecting, reading or writing fails.
         */
        private void download(String address, File target) throws IOException {

            URLConnection connection = new URL(address).openConnection();
            connection.setConnectTimeout(TIMEOUT_MILLIS);
            connection.setReadTimeout(TIMEOUT_MILLIS);

            InputStream in = connection.getInputStream();
            try {
                OutputStream out = new FileOutputStream(target);
                try {
                    byte[] buffer = new byte[1024];
                    int len;
                    while ((len = in.read(buffer)) != -1) {
                        out.write(buffer, 0, len);
                    }
                } finally {
                    out.close();
                }
            } finally {
                closeQuietly(in);
            }
        }

        /**
         * Validates the downloaded file and applies the configured mode to it.
         *
         * @param file The freshly downloaded file.
         * @return true if the file was kept, false if it was rejected and deleted.
         * @throws Exception Whatever the imaging classes throw.
         */
        private boolean postProcess(File file) throws Exception {

            PixelLoader image = new PixelLoader(file);

            if (!image.loadedOK) {
                // not a readable image
                file.delete();
                return false;
            }

            switch (mode) {

                case RESIZE:
                    // resize the file to the requested width.
                    // NOTE(review): assumes the second argument of Resizer.resize is the target
                    // width, as its use in the crop code suggests - confirm against Resizer.
                    Resizer.resize(file, width > 0 ? width : DEFAULT_RESIZE_WIDTH);
                    return true;

                case CROP:
                    return crop(image, file);

                default:
                    // LEAVE_ORIGINAL (or an unknown mode): keep the file as downloaded
                    return true;
            }
        }

        /**
         * Crops the file to width x height at a random position. Very wide images are first
         * shrunk to three times the target width. If the image is smaller than the target
         * size the file is deleted.
         *
         * @param image The loaded image.
         * @param file  The file the image was loaded from; overwritten with the cropped image.
         * @return true if the cropped file was saved, false if the file was deleted.
         * @throws Exception Whatever the imaging classes throw.
         */
        private boolean crop(PixelLoader image, File file) throws Exception {

            if (image.getWidth() <= width || image.getHeight() <= height) {
                file.delete();
                return false;
            }

            if (image.getWidth() > (width * 3)) {
                // shrink
                Resizer.resize(file, width * 3);
                // re-load
                image = new PixelLoader(file);

                // shrinking may have made the image too small for the crop window
                if (!image.loadedOK || image.getWidth() < width || image.getHeight() < height) {
                    file.delete();
                    return false;
                }
            }

            int dleft = image.getWidth() - width;
            int dheight = image.getHeight() - height;

            int rLeft = (int) (Math.random() * dleft);
            int rTop = (int) (Math.random() * dheight);

            SegmentedArea a = new SegmentedArea(rLeft, rTop, width, height);

            image = image.getSubImage(a);

            image.saveAs(file);

            return true;
        }
    }
}