Compare commits

...

22 commits

Author SHA1 Message Date
Tom Willemsen
02524e6780 Moved away from deprecated SGMLlib parser
I'd heard that the SGMLlib parser had been deprecated in favor of
HTMLParser, so now I've moved to that.
2011-06-11 11:38:21 +02:00
Tom Willemsen
4bd89ebded Removed wizard option, never use it
I once implemented it for a friend of mine, but he doesn't use it at
all, so I'd best just let it loose.
2011-06-11 11:34:43 +02:00
Tom Willemsen
05724b37b3 Archlinux uses python2 2011-05-27 11:36:28 +02:00
ryuslash
0323ddbed8 interrupt, save
On a keyboard interrupt during the regular download flow, the images collected in this session and in the last session are saved, so that the archive is not accidentally destroyed this way.
If an item is added to the new collection, it is removed from the old one.
If a save is being dumped (uncleanly saved because of KB interrupt), the old collection is appended to the new one.
Upon loading the returned string is split to enable removing of the old items.
2010-04-27 21:06:41 +02:00
ryuslash
ef79c9991b Nasty bug where it didn't actually save anything 2010-04-22 22:24:38 +02:00
ryuslash
99b87aeffb Log function works, need to start using it now 2010-04-21 15:40:20 +02:00
ryuslash
b5aac62357 Merge branch 'kirbybase' into develop 2010-04-19 09:16:59 +02:00
ryuslash
af529bcd4e Not checking and kirbybase
Using kirbybase to store downloaded images
A nasty bug that always returned the wrong result when checking whether an image had already been downloaded has been fixed
2010-04-19 09:16:04 +02:00
ryuslash
2db5555609 Archive lookup
4grab was saving the archived images to the wrong place, which is now fixed
2010-04-08 15:14:12 +02:00
ryuslash
9fb2b4ff58 Fixed Python 2.6 compatibility issue
Select was using "raise 'No Match'" to break from non-matches, this is
deprecated
2010-04-08 15:12:26 +02:00
ryuslash
a567e8630f Added kirbybase module 2010-04-08 15:10:48 +02:00
ryuslash
dfaa24b1b6 Error with archive
The archive function in sorter.py didn't archive to .arch, now it does
2010-04-07 23:30:59 +02:00
ryuslash
d58d029202 Separate download function
The download function has been separated from get_thread_links and get_image_links
2010-03-30 15:59:15 +02:00
ryuslash
7ab6d2911f Merge branch 'non-fixed-pages' into develop 2010-03-25 22:28:32 +01:00
ryuslash
14e2b0cc54 non-fixed page count
4grab no longer assumes either 11 or 16 pages, it will keep trying to collect new pages up to the moment it receives a 404 error
2010-03-25 22:28:08 +01:00
ryuslash
018abb7da1 Merge branch 'by-date' into develop 2010-03-22 15:18:15 +01:00
ryuslash
3980ccf38e Sorter also sorts by date now 2010-03-22 15:17:55 +01:00
ryuslash
4b70374e9d FAIL and print
If source and dest in copy are the same, it is no longer reported
If an image can't be read, it is counted as failed
2010-03-19 16:08:39 +01:00
ryuslash
96247d41d5 Merge branch 'sorting' into develop 2010-03-19 08:26:26 +01:00
ryuslash
7f8dfa1d30 Sorting, multi category, multi resolution
After a file has been downloaded a callback function can now be called.
The callback function I call checks to see if the resolution of the image appears in the collection of resolutions that has been entered in the configuration file and deletes/moves accordingly.
If a file can not be read (which I have noticed happens sometimes), it is removed, not copied and not archived so that it can be retried later.
4grab got a new command-line option, -s --sorter, to sort out old images; running python sorter.py has the same effect, but this seemed prettier.
Theoretically, multiple categories could now be entered into the configuration file, separated by ',', but this hasn't been tested yet.
Multiple resolutions could be entered into the configuration file, separated by ',', like so: 1680x1050,1920x1200.
Configuration now checks to see if all the necessary properties are available in the configuration file, if one is missing, it tries to create it.
2010-03-19 00:18:04 +01:00
ryuslash
4a9cc7e2b6 optioncreator, extra options
config now has an optioncreator property that will allow another module to select how to prompt for a property
resolutions and archive options have been added to config file for sorting functionality
2010-03-17 23:11:18 +01:00
ryuslash
8e101c92f9 Disconnected the property creation function
The function used to create a new property has been changed to be a callback function, so that later on a UI might use a dialog.
2010-03-16 23:17:46 +01:00
10 changed files with 2555 additions and 215 deletions

1
.gitignore vendored
View file

@ -2,3 +2,4 @@
*.pyc
\#*\#
.*
modules/plane.tbl

118
4grab.py
View file

@ -1,4 +1,4 @@
#!/usr/bin/python
#!/usr/bin/python2
######################################################################
# Copyright 2009, 2010 ryuslash
@ -21,51 +21,25 @@
import optparse
import sys
import os
import config
from util import raw_input_with_default
import util
import download
import progressbar
import sorter
import backend
config._optioncreator = raw_input_with_default
base_url = "http://boards.4chan.org/"
parser = optparse.OptionParser()
downloader = download.Downloader(progressbar.Progress)
def walk_with_wizard(baseurl):
wzrd_msg = "Pilates! *SHAZAM* Here they come!"
print "Alright, let me put on my robe and wizard hat."
# Single or all
inp = None
prompt = "Would you like to download a single thread, or all? "
inp = raw_input(prompt)
while (inp != "single" and inp != "all"):
print "Please type single or all"
inp = raw_input(prompt)
if inp == "single":
inp = raw_input("Which thread would you like to download? ")
if inp[:7] == "http://":
t = downloader.get_image_links("", [inp])
else:
thread = inp
inp = raw_input("Which category is this thread in? ")
print wzrd_msg
t = downloader.get_image_links("%s%s/res/" % (baseurl, inp),
[thread])
else:
inp = raw_input("Which category would you like to download? ")
config.Configuration().set_category(inp)
baseurl = "%s%s/" % (baseurl, config.Configuration().get_category())
print wzrd_msg
t = downloader.get_thread_links(baseurl)
t = downloader.get_image_links(baseurl, t)
(skipped, failed, downloaded, total) = downloader.get_images(t)
print "Downloaded: ", downloaded
print "Skipped: ", skipped
print "Failed: ", failed
print "Total: ", total
def parse_commands():
conf = config.Configuration()
parser.set_usage(
"""%prog [options]
@ -89,43 +63,42 @@ parser.add_option("-t",
metavar="THREAD",
help="Download only THREAD. If THREAD is only an ID, "
"CATEGORY must also be set. Otherwise, no problem :-)")
parser.add_option("-w",
"--wizard",
parser.add_option("-s",
"--sort",
action="store_true",
dest="wizard",
help="I'll put on my robe and wizard hat and help you "
"get some of those pictures you like")
dest="sort",
help="Sort downloaded images, most handy if you've used "
"older versions which didn't sort yet")
parser.add_option("-l",
"--loglevel",
nargs=1,
dest="loglevel",
metavar="LEVEL",
help="Changes the default log level to LEVEL")
(options, args) = parser.parse_args()
if options.confval and (options.tempcat
or options.thread
or options.wizard):
or options.wizard
or options.sort):
print "Can't configure something and do something else too."
exit(1)
if options.wizard and (options.tempcat
or options.thread
or options.confval):
print "Can't take a walk with the wizard and do something else too."
exit(1)
if options.confval:
if not config.Configuration().option_exists(options.confval[0]):
print ("%s: error: %s is not a "
"valid configuration option") % (sys.argv[0],
options.confval[0])
exit(1)
print "Setting", options.confval[0], "to", options.confval[1]
config.Configuration().set_option(options.confval[0],
options.confval[1])
config.Configuration().save()
if options.sort:
sort = sorter.Sorter()
for item in os.listdir(conf.get_download_location()):
sort.act(item)
exit(0)
elif options.wizard:
try:
walk_with_wizard(base_url)
except KeyboardInterrupt:
print
print "Alright, no more wizard hat and robe then. Goodbye"
if options.confval:
if not conf.option_exists(options.confval[0]):
print ("%s: error: %s is not a valid configuration option"
% (sys.argv[0], options.confval[0]))
exit(1)
print "Setting", options.confval[0], "to", options.confval[1]
conf.set_option(options.confval[0],
options.confval[1])
conf.save()
exit(0)
elif options.thread:
@ -150,10 +123,19 @@ elif options.thread:
exit(0)
elif options.tempcat:
config.Configuration().set_category(options.tempcat)
conf.set_categories([options.tempcat])
base_url = "%s%s/" % (base_url, config.Configuration().get_category())
elif options.loglevel is not None:
util.loglevel = util.LogType.from_int(options.loglevel)
if __name__ == "__main__":
conf = config.Configuration()
sort = sorter.Sorter()
parse_commands()
downloader.set_on_downloaded(sort.act)
for category in conf.get_categories():
base_url = "%s%s/" % (base_url, category)
try:
t = downloader.get_thread_links(base_url)
t = downloader.get_image_links(base_url, t)
@ -163,5 +145,9 @@ try:
print "Failed: ", failed
print "Total: ", total
except KeyboardInterrupt:
be = backend.Backend()
be.save(True) # Make sure that the downloaded images are saved anyway
print
print "So you don't want these images? Fine! I'll stop then."
util.log(util.LogType.Err, "Quit on user request")

70
backend.py Normal file
View file

@ -0,0 +1,70 @@
import os
import modules.kirbybase
from util import confdir,raw_input_with_default
import config
class _Backend(object):
""" A class that communicates with the datastore """
def __init__(self):
self.table = os.path.join(confdir, "images.tbl")
self.store = modules.kirbybase.KirbyBase()
self.__collection = ""
self.__new_collection = []
self.load()
def create_store_if_needed(self):
if not os.path.exists(self.table):
return self.store.create(self.table, ["filename:String"])
return True
def add(self, filename):
if filename in self.__collection:
self.__collection.remove(filename)
self.__new_collection.append(filename)
def check(self, filename):
collected = filename in self.__collection
downloaded = filename in self.__new_collection
if not downloaded:
self.add(filename)
if collected or downloaded:
return True
return False
def save(self, dump = False):
if dump:
self.__new_collection.extend(self.__collection)
if os.path.exists(self.table):
os.remove(self.table)
self.create_store_if_needed()
for f in self.__new_collection:
self.store.insert(self.table, [f])
def load(self):
if os.path.exists(self.table):
collection = self.store.select(self.table, ['recno'], ['*'], ['filename'], returnType="report")
if collection != '':
self.__collection = collection.split()
# Shared _Backend instance; created by the first Backend() call.
_backend = None


def Backend():
    """Return the shared _Backend instance, creating it on first use."""
    global _backend
    # Identity test: None is a singleton, so `is` is the correct comparison.
    if _backend is None:
        _backend = _Backend()
    return _backend
# Run as a script: register every entry of the archive directory with the
# backend.
# NOTE(review): no save() call is visible after the loop, so these additions
# appear to stay in memory only -- confirm whether that is intended.
if __name__ == "__main__":
backend = Backend()
# config.Configuration() raises ValueError unless an option creator is set.
config._optioncreator = raw_input_with_default
cfg = config.Configuration()
for f in os.listdir(cfg.get_archive_location()):
backend.add(f)

118
config.py
View file

@ -20,43 +20,94 @@
import os
import ConfigParser
import sys
homedir = os.getenv("HOME")
if homedir is None:
homedir = os.path.dirname(sys.argv[0])
from util import homedir, confdir
class _Configuration(object):
def __init__(self):
self.filename = os.path.join(os.path.join(homedir, ".4grab"), "config.cfg")
def __init__(self, optioncreator):
self.filename = os.path.join(confdir, "config.cfg")
self.configparser = ConfigParser.RawConfigParser()
if not os.path.exists(self.filename):
self.create_new()
else:
self.optioncreator = optioncreator
self.configparser.read(self.filename)
def create_new(self):
self.configparser.add_section("settings")
self.set_category(self.raw_input_with_default("w", "Please enter which category you would like to download from: "))
def check(self):
changed = False
# read if it exists
if os.path.exists(self.filename):
self.configparser.read(self.filename)
# locations
if not self.configparser.has_section("locations"):
self.configparser.add_section("locations")
self.configparser.set("locations", "download", self.raw_input_with_default(os.path.join(homedir, "Pictures"), "Please enter where you would like the downloads to go: "))
# locations/download_base
if not self.configparser.has_option("locations", "download_base"):
self.create_option("locations",
"download_base",
os.path.join(homedir,
"Pictures"),
"Please enter where "
"you would like the "
"downloads to go: ")
changed = True
# locations/archive
if not self.configparser.has_option("locations", "archive"):
self.create_option("locations",
"archive",
os.path.join(self.configparser.get("locations",
"download_base"),
".arch"),
"Please enter where in {download_base} you "
"would like to store archived images (used for "
"checking what to download): ")
changed = True
# settings
if not self.configparser.has_section("settings"):
self.configparser.add_section("settings")
# settings/categories
if not self.configparser.has_option("settings", "categories"):
self.create_option("settings",
"categories",
"w",
"Please enter which "
"category you would like "
"to download from: ")
changed = True
# settings/resolutions
if not self.configparser.has_option("settings", "resolutions"):
self.create_option("settings",
"resolutions",
"1600x1050,1900x1200,1900x1080",
"Please enter your preferred "
"resolutions (* for all)")
changed = True
# save
if changed:
self.save()
def raw_input_with_default(self, default, prompt):
inp = raw_input("%s (default=%s): " % (prompt, default))
if inp == "":
return default
return inp
def create_option(self, section, name, default, message):
    """Obtain a value from the option creator and store it as section/name.

    The callback is invoked as optioncreator(default, message); whatever it
    returns is written to the parser.
    """
    chosen = self.optioncreator(default, message)
    self.configparser.set(section, name, chosen)
def get_download_location(self):
return self.configparser.get("locations", "download")
return self.configparser.get("locations", "download_base")
def set_download_location(self, value):
    """Store *value* as the "download_base" option of section "locations"."""
    section, option = "locations", "download_base"
    self.configparser.set(section, option, value)
def get_category(self):
return self.configparser.get("settings", "category")
def get_archive_location(self):
    """Return the "archive" option of section "locations"."""
    section, option = "locations", "archive"
    return self.configparser.get(section, option)
def set_archive_location(self, value):
    """Store *value* as the "archive" option of section "locations"."""
    section, option = "locations", "archive"
    self.configparser.set(section, option, value)
def set_category(self, value):
self.configparser.set("settings", "category", value)
def get_categories(self):
    """Return the "categories" setting as a list, split on commas."""
    raw = self.configparser.get("settings", "categories")
    return raw.split(',')
def set_categories(self, value = ()):
    """Store the category names in *value* as one comma-separated setting.

    The text is written to the "categories" option of section "settings",
    the same option that get_categories() reads, so a value set here is
    visible to later reads.  An immutable empty default replaces the former
    mutable list default; calling without arguments still stores "".
    """
    self.configparser.set("settings", "categories", ','.join(value))
def get_resolutions(self):
    """Return the "resolutions" setting as a list, split on commas."""
    raw = self.configparser.get("settings", "resolutions")
    return raw.split(',')
def set_resolutions(self, value = []):
    """Store the entries of *value*, comma-joined, as "settings"/"resolutions"."""
    joined = ','.join(value)
    self.configparser.set("settings", "resolutions", joined)
def option_exists(self, option):
sections = self.configparser.sections()
@ -65,6 +116,7 @@ class _Configuration(object):
return True
return False
# Should only be used by the command-line
def set_option(self, option, value):
sec = None
sections = self.configparser.sections()
@ -81,9 +133,19 @@ class _Configuration(object):
def save(self):
dirname = os.path.dirname(self.filename)
if not os.path.exists(dirname):
os.mkdir(dirname)
os.makedirs(dirname)
configfile = open(self.filename, "w")
self.configparser.write(configfile)
_configuration = _Configuration()
def Configuration(): return _configuration
# Shared _Configuration instance; created by the first successful
# Configuration() call.
_configuration = None
# Callback used to obtain values for missing options; the importing module
# must assign it before Configuration() is called.
_optioncreator = None
# Return the shared configuration object.
# Raises ValueError while _optioncreator is still None.
# NOTE(review): indentation is not visible here, so it cannot be told whether
# check() runs only when the instance is first created or on every call --
# confirm against the real file.
def Configuration():
global _optioncreator
global _configuration
if _optioncreator is None:
raise ValueError("optioncreator must be set")
if _configuration is None:
_configuration = _Configuration(_optioncreator)
_configuration.check()
return _configuration

View file

@ -1,5 +1,3 @@
#!/usr/bin/env python
######################################################################
# Copyright 2009, 2010 ryuslash
#
@ -22,28 +20,36 @@
import urllib
import os
import htmlparser
#import progressbar
import config
import sys
import backend
import util
savedir = config.Configuration().get_download_location()
def get_savedir():
    """Return the configured download location, creating it when absent."""
    target = config.Configuration().get_download_location()
    if os.path.exists(target):
        return target
    os.makedirs(target)
    return target
def check_archive(fullpath):
    """Ask the backend whether the file name part of *fullpath* is known.

    Only the base name is passed on; the result of the backend's check()
    call is returned unchanged.
    """
    return backend.Backend().check(os.path.basename(fullpath))
def write(message):
    """Emit *message* on standard output as-is and flush straight away."""
    out = sys.stdout
    out.write(message)
    out.flush()
class Downloader(object):
def __init__(self, progress_reporter):
self.progress_reporter = progress_reporter
self.on_downloaded = None
def get_thread_links(self, baseurl):
myparser = htmlparser.MyParser()
t = ["0", "1", "2", "3", "4", "5", "6", "7", "8", "9", "10"]
i = 1
total = len(t)
progress = self.progress_reporter(total)
def set_on_downloaded(self, on_downloaded):
self.on_downloaded = on_downloaded
for pagenum in t:
progress.show_progress(i)
url = baseurl + pagenum
def download(self, url):
f = None
tries = 10
while tries > 0:
try:
@ -51,8 +57,25 @@ class Downloader(object):
break
except IOError:
tries -= 1
print "\rTry of", url, "failed,", tries, "tries left"
write("\rTry of %s failed, %d tries left" % (url, tries))
return f
def get_thread_links(self, baseurl):
myparser = htmlparser.MyParser()
i = 0
code = 0
url = None
while code != 404:
url = baseurl + str(i)
f = self.download(url)
if not f is None:
code = f.getcode()
if code == 404:
write("\rCollected %d pages\n" % i)
f.close()
continue
# Read the response
s = f.read()
f.close()
@ -60,10 +83,11 @@ class Downloader(object):
# Process the page.
myparser.parse(s)
else:
"\rOpening of", url, "did not succeed, trying next one..."
write("\rOpening of %s did not succeed, trying next one..." \
% url)
i += 1
write("\rCollected %d pages" % i)
progress.complete()
return myparser.get_hyperlinks()
def get_image_links(self, baseurl, t = []):
@ -76,21 +100,16 @@ class Downloader(object):
progress.show_progress(i)
img_url = baseurl + link
tries = 10
while tries > 0:
try:
f = urllib.urlopen(img_url)
break
except IOError:
tries -= 1
print "\rTry of", img_url, "failed,", tries, "tries left"
f = self.download(img_url)
if not f is None:
s = f.read()
f.close()
mysubparser.parse(s)
else:
print "\rOpening of", img_url, "did not succeed, trying next one..."
write("\rOpening of %s did not succeed, " \
"trying next one..." % img_url)
i += 1
progress.complete()
@ -105,8 +124,10 @@ class Downloader(object):
i = 1
for link in t:
progress.show_progress(i)
filename = os.path.join(savedir, os.path.split(link)[1])
if not os.path.exists(filename):
filename = os.path.join(get_savedir(), os.path.split(link)[1])
if not check_archive(filename):
util.log(util.LogType.Msg, "%s is not in archive" % filename, None)
tries = 10
while tries > 0:
try:
@ -117,19 +138,25 @@ class Downloader(object):
if tries == 0:
failed += 1
else:
util.log(util.LogType.Msg, "succsesfully downloaded %s" % filename, None)
downloaded += 1
if self.on_downloaded is not None:
util.log(util.LogType.Msg, "", self.on_downloaded)
if not self.on_downloaded(filename):
failed += 1
else:
util.log(util.LogType.Warn, "on_downloaded is None", None)
else:
skipped += 1
i += 1
progress.complete()
be = backend.Backend()
be.save()
return (skipped, failed, downloaded, total)
if __name__ == "__main__":
# Get a file-like object for the 4chan.org w/imgboard
base_url = "http://boards.4chan.org/" + config.Configuration().get_category() + "/"
# Get the hyperlinks.
t = get_thread_links(base_url)
t = get_image_links(base_url, t)
get_images(t)
print "Don't run me, run 4grab.py"

View file

@ -17,33 +17,35 @@
# along with 4grab. If not, see <http://www.gnu.org/licenses/>.
######################################################################
import sgmllib
from HTMLParser import HTMLParser, HTMLParseError
import re
class MyParser(sgmllib.SGMLParser):
def __init__(self, verbose=0):
sgmllib.SGMLParser.__init__(self, verbose)
class MyParser(HTMLParser):
def __init__(self):
HTMLParser.__init__(self)
self.hyperlinks = []
self.url_reg = re.compile('res/\d+\Z')
self.prev = ""
def parse(self, s):
self.feed(s)
self.close()
def start_a(self, attributes):
for name, value in attributes:
if name == "href":
def handle_starttag(self, tag, attrs):
prev = ""
if tag == 'a':
for name, value in attrs:
if name == 'href':
if self.url_reg.search(value) != None:
if self.prev != value:
if prev != value:
self.hyperlinks.append(value)
self.prev = value
prev = value
def get_hyperlinks(self):
return self.hyperlinks
class MySubParser(MyParser):
def __init__(self, verbose=0):
MyParser.__init__(self, verbose)
def __init__(self):
MyParser.__init__(self)
self.url_reg = re.compile('/src/\d+\.\w{3,4}\Z')

0
modules/__init__.py Normal file
View file

2035
modules/kirbybase.py Normal file

File diff suppressed because it is too large Load diff

118
sorter.py Normal file
View file

@ -0,0 +1,118 @@
######################################################################
# Copyright 2009, 2010 ryuslash
#
# This file is part of 4grab.
#
# 4grab is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# 4grab is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with 4grab. If not, see <http://www.gnu.org/licenses/>.
######################################################################
import config
import Image
import shutil
import os
import datetime
import backend
import util
def dummy_option_creator(value1, value2):
    """Stand-in option creator: ignores both arguments and yields None."""
    return None
# Installed at import time so that config.Configuration() does not raise for
# a missing option creator.
# NOTE(review): this replaces any option creator assigned before this module
# was imported -- confirm that the import order makes this harmless.
config._optioncreator = dummy_option_creator
# Sorts image files found in the download location into per-resolution
# folders and records them with the backend.
class Sorter:
# Cache the configuration object and the configured resolution strings.
def __init__(self):
self.conf = config.Configuration()
self.resolutions = self.conf.get_resolutions()
# Process one file. retval starts as True and is set to False when
# Image.open raises IOError; it is the value returned.
# NOTE(review): indentation is not visible in this view, so it cannot be
# told from here under which conditions archive() and remove() run (for
# example for unreadable images, or for files whose extension is not
# accepted) -- confirm against the real file before relying on this.
def act(self, filename):
util.log(util.LogType.Msg, "sorter is acting", filename)
download_base = self.conf.get_download_location()
retval = True
if self.check_filename(filename):
image = None
try:
image = Image.open(os.path.join(download_base,
filename))
except IOError:
retval = False
if not image == None and self.archive_check(filename):
util.log(util.LogType.Msg, "Checking resolution", {"filename":filename, "resolution":image.size})
for resolution in self.resolutions:
# Each configured entry is split on 'x' and its two parts are compared, as
# strings, with the image's size values.
# NOTE(review): an entry without 'x' (the configuration prompt offers "*")
# would make resolution[1] raise IndexError -- confirm.
resolution = resolution.split('x')
foldername = "%s-%s" % (resolution[0],
resolution[1])
folderpath = os.path.join(download_base,
foldername)
if str(image.size[0]) == resolution[0] and \
str(image.size[1]) == resolution[1]:
if not os.path.exists(folderpath):
os.makedirs(folderpath)
self.copy(filename, folderpath)
break
self.archive(filename)
self.remove(filename)
return retval
# Copy the file from the download location into a sub-folder of destpath
# named after today's date (year-month-day, numbers not zero-padded).
def copy(self, filename, destpath):
download_base = self.conf.get_download_location()
source = os.path.join(download_base,
os.path.basename(filename))
today = datetime.date.today()
dest = os.path.join(destpath,
"%d-%d-%d" % (today.year, today.month, today.day))
util.log(util.LogType.Msg, "going to copy %s to %s" % (source, dest), None)
if not os.path.exists(dest):
os.makedirs(dest)
dest = os.path.join(dest,
os.path.basename(filename))
# Copying a file onto itself is reported instead of attempted.
if source != dest:
shutil.copy(source, dest)
else:
print "\nHow can this even happen?! Copying", source, "to", dest
# Register the file's base name with the backend.
def archive(self, filename):
be = backend.Backend()
be.add(os.path.basename(filename))
# Return the backend's check() result for the file's base name.
def archive_check(self, filename):
be = backend.Backend()
return be.check(os.path.basename(filename))
# True only when the extension is exactly ".jpg", ".png" or ".gif"
# (the comparison is case-sensitive).
def check_filename(self, filename):
ext = os.path.splitext(filename)[1]
return ext == ".jpg" or \
ext == ".png" or \
ext == ".gif"
# Delete the file from the download location.
def remove(self, filename):
download_base = self.conf.get_download_location()
source = os.path.join(download_base, filename)
os.remove(source)
# Run as a script: pass every entry of the download location to a Sorter.
if __name__ == "__main__":
conf = config.Configuration()
download_base = conf.get_download_location()
sorter = Sorter()
for item in os.listdir(download_base):
sorter.act(item)

39
util.py Normal file
View file

@ -0,0 +1,39 @@
import os
import sys
class LogType:
    """Integer codes for the log levels used by this module."""

    Non = 0
    Err = 1
    Warn = 2
    Msg = 3

    @staticmethod
    def from_int(lloglevel):
        """Translate *lloglevel* (anything int() accepts) into a level code.

        Returns the matching constant, or None when the number is not one of
        the four known levels.
        """
        number = int(lloglevel)
        for level in (LogType.Non, LogType.Err, LogType.Warn, LogType.Msg):
            if number == level:
                return level
        return None
# Current log threshold consulted by log(); LogType.Non is the initial value.
loglevel = LogType.Non
def raw_input_with_default(default, prompt):
    """Ask for a value on standard input, falling back to *default*.

    The text shown is "<prompt> (default=<default>): ".  An empty answer
    yields *default*; any other answer is returned as typed.
    """
    answer = raw_input("%s (default=%s): " % (prompt, default))
    return default if answer == "" else answer
def log(logtype, message, data = None):
    """Print *message* when the module log level admits *logtype*.

    Nothing is printed while the module-level ``loglevel`` is lower than
    *logtype*.  When *data* is not None, its string form is printed on an
    indented line after a "data:" heading.  Previously the literal word
    "data" was printed instead of the value.
    """
    if loglevel < logtype:
        return
    # Single-argument print(...) behaves the same as the print statement.
    print(message)
    if data is not None:
        # Wrap in a tuple so tuple-valued data is formatted as one value.
        print("data:\n\t%s" % (data,))
# Base directory for per-user data: $HOME when set, otherwise the directory
# part of the script path in sys.argv[0].
homedir = os.getenv("HOME")
if homedir is None:
homedir = os.path.dirname(sys.argv[0])
# Directory for 4grab's own files; config.py and backend.py build their file
# paths from it.
confdir = os.path.join(homedir, ".4grab")