# A very poorly written recipe — made through copying, pasting, and a whole lot of trial and error — for the Lexington Herald-Leader.
'''
Lexington Herald-Leader Calibre Recipe
'''
# Import the regular expressions module.
import re, string, time
# Import the BasicNewsRecipe class which this class extends.
from calibre.web.feeds.recipes import BasicNewsRecipe
from calibre import strftime
from datetime import timedelta, date
from time import sleep
from calibre.web.feeds.recipes import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag, BeautifulStoneSoup
class AdvancedUserRecipe1371040942(BasicNewsRecipe):
    """Calibre recipe that builds an ebook from the Lexington Herald-Leader RSS feeds.

    All configuration is done through the class attributes below; the only
    overridden hook is :meth:`get_cover_url`, which looks up a recent
    front-page image to use as the cover.
    """

    title = u'Lexington Herald-Leader'

    # A brief description for the ebook.
    description = u'Lexington Herald-Leader web site ebook created using rss feeds.'

    # The max number of articles which may be downloaded from each feed.
    max_articles_per_feed = 100

    # NOTE(review): 'filterDuplicates' does not appear to be an option that
    # BasicNewsRecipe itself reads (the documented switch is
    # 'ignore_duplicate_articles') - confirm before relying on it.
    filterDuplicates = True

    # The max age of articles which may be downloaded from each feed. This is
    # specified in days - note fractions of days are allowed, e.g. 2.5 (2 and a
    # half days). My default of 1.5 days is the last 36 hours, the point at
    # which I've decided 'news' becomes 'old news', but be warned this is not
    # so good for the blogs, technology, magazine, etc., and sports feeds.
    # You may wish to extend this to 2-5 but watch out: ebook creation time
    # will increase as well. Setting this to 30 will get everything (AFAICT)
    # as long as max_articles_per_feed remains set high.
    oldest_article = 1.5

    # Number of simultaneous downloads. Speeds things up from the default of 5.
    # If you have a lot of feeds and/or have increased oldest_article above 2
    # then you may wish to try increasing simultaneous_downloads to 25-30,
    # or, of course, if you are in a hurry. [I've not tried beyond 20.]
    simultaneous_downloads = 20

    # Timeout for fetching files from the server in seconds. The default of
    # 120 seconds seems somewhat excessive.
    timeout = 30

    # The format string for the date shown on the ebook's first page.
    # List of all values: http://docs.python.org/library/time.html
    # Default in news.py has a leading space so that's mirrored here.
    # As with 'feeds' select/de-select by adding/removing the initial '#';
    # only one timefmt should be selected, here's a few to choose from.
    #
    #timefmt = ' [%a, %d %b %Y]' # [Fri, 14 Nov 2011] (Calibre default)
    #timefmt = ' [%a, %d %b %Y %H:%M]' # [Fri, 14 Nov 2011 18:30]
    timefmt = ' [%a, %d %b %Y %I:%M %p]' # [Fri, 14 Nov 2011 06:30 PM]
    #timefmt = ' [%d %b %Y]' # [14 Nov 2011]
    #timefmt = ' [%d %b %Y %H:%M]' # [14 Nov 2011 18.30]
    #timefmt = ' [%Y-%m-%d]' # [2011-11-14]
    #timefmt = ' [%Y-%m-%d-%H-%M]' # [2011-11-14-18-30]

    # Let calibre extract the article text automatically, but always keep
    # the elements matched by this XPath expression.
    auto_cleanup = True
    auto_cleanup_keep = '//div[@class="aside"]|//*[@class="story_content"]'

    # Author of this recipe.
    __author__ = 'kg4vma'

    # Language of the RSS feeds. The Herald-Leader is a US (Kentucky) paper,
    # so this is US English rather than the 'en_GB' the recipe was copied with.
    language = 'en_US'

    # Set tags.
    tags = 'news, sport, blog'

    # Set publisher and publication type.
    publisher = 'Lexington Herald-Leader'
    publication_type = 'newspaper'

    # Disable stylesheets from site.
    no_stylesheets = True

    # Custom stylesheet, currently disabled (kept for reference).
    # extra_css = '
    # .articleHeadline { text-align: left; margin-top:0.5em; margin-bottom:0.25em; }
    # .credit { font-weight: normal; text-align: right; font-size: 50%; line-height:1em; margin-top:5px; margin-left:0; margin-right:0; margin-bottom: 0; }
    # .byline { text-align: left; font-size: 50%; line-height:1em; margin-top:10px; margin-left:0; margin-right:0; margin-bottom: 0; }
    # .dateline { text-align: left; font-size: 50%; line-height:1em;margin-top:5px; margin-left:0; margin-right:0; margin-bottom: 0; }
    # .kicker { font-size: 50%; line-height:1em;margin-top:5px; margin-left:0; margin-right:0; margin-bottom: 0; }
    # .timestamp { font-weight: normal; text-align: left; font-size: 50%; }
    # .caption { font-size: 50%; font-style:italic; line-height:1em; margin-top:5px; margin-left:0; margin-right:0; margin-bottom: 0; }
    # a:link {text-decoration: none; }
    # .date{font-size: 50%; }
    # .update{font-size: 50%; }
    # .articleBody { }
    # .authorId {text-align: left; font-size: 50%; }
    # .image {text-align: center;}
    # .aside {color:blue;margin:0px 0px 0px 0px; padding: 0px 0px 0px 0px; font-size:100%;}
    # .asidenote {color:blue;margin:0px 0px 0px 0px; padding: 0px 0px 0px 0px; font-size:100%;font-weight:bold;}
    # .source {text-align: left; font-size: x-small; }'

    # Specifies an override encoding for sites that have an incorrect charset
    # specified. Default of 'None' says to auto-detect.
    encoding = None

    # Sets whether a feed has full articles embedded in it.
    use_embedded_content = False

    # Removes empty feeds - why keep them!?
    remove_empty_feeds = True

    # (section title, feed URL) pairs, in the order they appear in the ebook.
    feeds = [
        (u'Top News', u'http://www.kentucky.com/322/index.rss'),
        (u'Local News', u'http://www.kentucky.com/164/index.rss'),
        (u'Obituaries', u'http://www.legacy.com/services/obitrss.asp?Source=kentucky'),
        (u'Nation / World', u'http://www.kentucky.com/1250/index.rss'),
        (u'Opinion', u'http://www.kentucky.com/349/index.rss'),
        (u'Life / Neighbors', u'http://www.kentucky.com/131/index.rss'),
        (u'Entertainment', u'http://www.kentucky.com/684/index.rss'),
        (u'Sports', u'http://www.kentucky.com/268/index.rss'),
        (u'Business', u'http://www.kentucky.com/101/index.rss'),
        (u'Weather', u'http://weather.bloginky.com/feed/'),
    ]

    # File-name stem of this paper's front-page image in the cover URL
    # built by get_cover_url().
    cover_tag = 'KY_LHL'

    def get_cover_url(self):
        """Return the URL of the most recent available front-page image.

        The image URL is keyed by day of the month, so today's image is tried
        first and then each of the previous six days in turn. The first URL
        that can be opened is returned.

        Returns:
            The cover image URL as a string, or ``None`` (after logging
            "Cover unavailable") when none of the seven candidates opens.
        """
        # One browser is enough for all attempts.
        browser = BasicNewsRecipe.get_browser(self)
        today = date.today()
        for days_back in range(7):
            day_of_month = (today - timedelta(days=days_back)).day
            cover = (
                'http://webmedia.newseum.org/newseum-multimedia/dfp/jpg'
                + str(day_of_month) + '/lg/' + self.cover_tag + '.jpg'
            )
            try:
                browser.open(cover)
            except Exception:
                # Best effort: any fetch failure just means "try an older
                # image". Unlike a bare except, this lets KeyboardInterrupt
                # and SystemExit propagate.
                continue
            return cover
        self.log("\nCover unavailable")
        return None