My adventures with eBooks: A calibre recipe for the Cynthiana Democrat

…another poorly written recipe—made through copying, pasting, and a whole lot of trial and error—this time for the Cynthiana Democrat. (It doesn’t really work anymore…)

''' Cynthiana Democrat Calibre Recipe '''


# Import the standard-library modules (regular expressions, string, time).

import re, string, time
# Import the BasicNewsRecipe class which this class extends.

from calibre.web.feeds.recipes import BasicNewsRecipe
from calibre import strftime

from datetime import timedelta, date

from time import sleep

from calibre.web.feeds.recipes import BasicNewsRecipe

from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag, BeautifulStoneSoup
class AdvancedUserRecipe1371040942(BasicNewsRecipe):
    """Calibre recipe that builds an ebook from The Cynthiana Democrat's RSS feeds.

    The class is purely declarative: it only sets configuration attributes
    and defines no methods of its own.
    """

    # ------------------------------------------------------------------
    # Identity and metadata
    # ------------------------------------------------------------------
    title = u'The Cynthiana Democrat'
    # Short description for the ebook.
    description = u'Cynthiana Democrat web site ebook created using rss feeds.'
    # Author of this recipe.
    __author__ = 'kg4vma'
    publisher = 'The Cynthiana Democrat'
    publication_type = 'newspaper'
    # Language of the RSS feeds.
    language = 'en_US'
    tags = 'news, sport, blog'
    masthead_url = 'http://www.cynthianademocrat.com/sites/www.cynthianademocrat.com/files/cynthianalogov2.png'

    # Date format shown on the ebook's first page; the leading space is
    # deliberate (it mirrors the default in calibre's news.py).
    # Format codes: http://docs.python.org/library/time.html
    # Exactly one timefmt should be active. Alternatives:
    #   ' [%a, %d %b %Y]'           -> [Fri, 14 Nov 2011] (calibre default)
    #   ' [%a, %d %b %Y %H:%M]'     -> [Fri, 14 Nov 2011 18:30]
    #   ' [%d %b %Y]'               -> [14 Nov 2011]
    #   ' [%d %b %Y %H:%M]'         -> [14 Nov 2011 18:30]
    #   ' [%Y-%m-%d]'               -> [2011-11-14]
    #   ' [%Y-%m-%d-%H-%M]'         -> [2011-11-14-18-30]
    timefmt = ' [%a, %d %b %Y %I:%M %p]'  # [Fri, 14 Nov 2011 06:30 PM]

    # ------------------------------------------------------------------
    # Article selection
    # ------------------------------------------------------------------
    # Skip articles older than this many days (fractions are allowed).
    # Larger values pull in more articles and lengthen ebook creation.
    oldest_article = 7
    # Upper bound on the number of articles taken from any single feed.
    max_articles_per_feed = 100
    # NOTE(review): possibly not an option calibre reads; the de-duplication
    # setting appears to be ignore_duplicate_articles below -- confirm.
    filterDuplicates = True
    ignore_duplicate_articles = {'title', 'url'}
    # Drop feeds that end up with no articles.
    remove_empty_feeds = True
    # The feeds do not carry the full article text.
    use_embedded_content = False
    recursions = 0

    # ------------------------------------------------------------------
    # Downloading
    # ------------------------------------------------------------------
    # Number of simultaneous downloads, raised from the default of 5 to
    # speed things up.
    simultaneous_downloads = 20
    # Seconds to wait when fetching from the server (the default is 120).
    timeout = 30
    # None means the character encoding is auto-detected.
    encoding = None

    # ------------------------------------------------------------------
    # Content clean-up and presentation
    # ------------------------------------------------------------------
    # Ignore the site's own stylesheets.
    no_stylesheets = True
    scale_news_images_to_device = True
    auto_cleanup = True
    # XPath of elements that automatic clean-up must keep.
    auto_cleanup_keep = '//*[@class="content"]|//*[@class="slide-wrap"]'
    # Earlier, more specific variant kept for reference:
    # auto_cleanup_keep = '//div[@class="subsection-photo"]|//div[@class="top-news-pic"]|//div[@class="slide-wrap"]|//div[@class="source"]|//div[@class="news-sidebar"]'

    # ------------------------------------------------------------------
    # Feeds: (section title, RSS URL)
    # ------------------------------------------------------------------
    feeds = [
        (u'News', u'http://www.cynthianademocrat.com/todaysnews/rss.xml'),
        (u'Did you Know', u'http://www.cynthianademocrat.com/rss.xml'),
        (u'Features', u'http://www.cynthianademocrat.com/features/rss.xml'),
        (u'Obituaries', u'http://www.legacy.com/services/obitrss.asp?Source=cynthianademocrat'),
        (u'Opinion', u'http://www.cynthianademocrat.com/todaysopinions/rss.xml'),
        (u'Business', u'http://www.cynthianademocrat.com/news/business/rss.xml'),
        (u'Sports', u'http://www.cynthianademocrat.com/sports/rss.xml'),
    ]

    # Disabled: fetch the cover from the Newseum front-page archive, stepping
    # back one day at a time (up to a week) when today's image is missing.
    # NOTE(review): line breaks and indentation were reconstructed from a
    # single collapsed line -- verify before re-enabling.
    #cover_tag = 'KY_LHL'
    #def get_cover_url(self):
    #    from datetime import timedelta, date
    #    cover = 'http://webmedia.newseum.org/newseum-multimedia/dfp/jpg'+str(date.today().day)+'/lg/'+self.cover_tag+'.jpg'
    #    br = BasicNewsRecipe.get_browser(self)
    #    daysback=1
    #    try:
    #        br.open(cover)
    #    except:
    #        while daysback<7:
    #            cover = 'http://webmedia.newseum.org/newseum-multimedia/dfp/jpg'+str((date.today() - timedelta(days=daysback)).day)+'/lg/'+self.cover_tag+'.jpg'
    #            br = BasicNewsRecipe.get_browser(self)
    #            try:
    #                br.open(cover)
    #            except:
    #                daysback = daysback+1
    #                continue
    #            break
    #    if daysback==7:
    #        self.log("\nCover unavailable")
    #        cover = None
    #    return cover

One thought on “My adventures with eBooks: A calibre recipe for the Cynthiana Democrat”