Last active
January 1, 2023 22:34
-
-
Save yalov/055e636e6bfc35c7d7b096aa8aa26c0d to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
""" | |
Download and add description to ical files from the calend.ru | |
requirements: | |
>= Python3.6 | |
pip install icalendar trafilatura colorama requests
version: 11 | |
Created on Fri Jan 20 21:55:59 2017. @author: yalov | |
Public domain license. | |
""" | |
import argparse | |
import os | |
import re | |
import sys | |
import time | |
import requests | |
import trafilatura | |
import colorama | |
from icalendar import Calendar | |
colorama.init() | |
def download(year):
    """Download the calend.ru iCalendar feeds for *year*.

    Each feed is saved under its own file name inside a directory named
    after *year*, which is created if it does not exist yet.

    Args:
        year: The year as a string. It is inserted into the request URL and
            also used as the name of the target directory.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.RequestException: On connection problems or timeout.
    """
    calends = ["ical-belorus.ics", "ical-jew.ics", "ical-russtate.ics",
               "ical-ukraine.ics", "ical-wholeworld.ics"]

    # Create the target directory once, up front, instead of re-checking
    # on every iteration.
    os.makedirs(year, exist_ok=True)

    for calend in calends:
        url = f"https://www.calend.ru/ical/{calend}?v=yy{year}&b=1"
        # A timeout keeps the script from hanging forever on a stalled
        # connection.
        response = requests.get(url, allow_redirects=True, timeout=30)
        # Fail loudly rather than saving an HTML error page as an .ics file.
        response.raise_for_status()

        path = os.path.join(year, calend)
        print(path)
        with open(path, 'wb') as out:
            out.write(response.content)
def get_description(url):
    """Fetch a holiday page from calend.ru and return its cleaned text.

    The page is fetched with trafilatura; a failed fetch is retried up to
    three more times with a two-second pause in between. Photo credit
    lines ("Фото: ..., ...") are blanked out of the extracted text.

    Args:
        url: Address of the holiday page.

    Returns:
        The extracted text, or an empty string when the page could not be
        fetched or nothing could be extracted from it.
    """
    downloaded = trafilatura.fetch_url(url)
    retries_left = 3
    while not downloaded and retries_left > 0:
        retries_left -= 1
        print(colorama.Fore.RED + "fetch failed ... " + colorama.Fore.RESET, end="")
        time.sleep(2)
        downloaded = trafilatura.fetch_url(url)

    # Every attempt failed: do not claim success and do not hand an empty
    # document to the extractor.
    if not downloaded:
        print(colorama.Fore.RED + "giving up ... " + colorama.Fore.RESET, end="")
        return ""
    print("fetched ... ", end="")

    desc = trafilatura.extract(downloaded)
    # Without this guard, str(None) would leak the literal text "None"
    # into the event description.
    if desc is None:
        print(colorama.Fore.RED + "nothing extracted ... " + colorama.Fore.RESET, end="")
        return ""

    desc = re.sub(r'Фото: .+, .+', ' ', desc)
    print("extracted ... ", end="")
    return desc
def proceed(source_path, destination_path):
    """Add a description to every event of an iCalendar file.

    The calendar at *source_path* is parsed, its X-WR-CALNAME is shortened,
    and each VEVENT gets a DESCRIPTION made of the event's page URL (taken
    from its COMMENT property) followed by the page text. Page texts are
    cached as UTF-8 files in a "fetched" folder next to the source file, so
    a page is downloaded only once. The calendar name and component count
    are appended to "log.txt" in the same folder. The result is written to
    *destination_path*.

    Pressing Ctrl+C during the pause that follows a download ends the
    program with exit status 0 without writing the destination file.

    Args:
        source_path: Path of the .ics file to read.
        destination_path: Path of the .ics file to write.
    """
    parentfolder = os.path.dirname(source_path)
    print("Proceed: {} -> {}".format(source_path, destination_path))

    # Open the source iCalendar.
    with open(source_path, 'rb') as g:
        gcal = Calendar.from_ical(g.read())

    components = gcal.walk()
    # The walk includes the VCALENDAR itself, hence the "- 1".
    count = len(components) - 1

    for i, component in enumerate(components):
        if component.name == "VCALENDAR":
            calname = str(component.get('X-WR-CALNAME', ''))
            m = re.match(r"^([a-zA-Z.]+)[ -]+([\w ]+)[ -]+(\d+)$", calname)
            if m:
                # "<site> - <title> - <year>"  ->  "<title>'<year>"
                name = m.group(2).strip() + '\'' + m.group(3)
                component['X-WR-CALNAME'] = name
            else:
                # Unexpected format: keep the calendar name untouched
                # instead of crashing on a failed match.
                name = calname
            print('Name = ', name)
            with open(os.path.join(parentfolder, "log.txt"), 'a', encoding='utf-8') as f:
                f.write(name + "\n")
                f.write("count: {}\n".format(count))

        if component.name == "VEVENT":
            # The feed sometimes prefixes the site address twice.
            url = component['COMMENT'].replace('https://www.calend.ruhttps://', 'https://')
            title = component['SUMMARY']
            print('({0}/{1}) {2}... '.format(
                i, count, colorama.Fore.YELLOW + title + colorama.Fore.RESET), end="")

            url_code = "".join([c for c in url if c.isdigit()])
            # Replace anything that is not alphanumeric, ".", "-", "_" or space.
            filename = re.sub(r'[^\w\-\. ]', '_', url_code + " " + title)
            filepath = os.path.join(parentfolder, "fetched", filename + ".txt")

            if os.path.exists(filepath):
                try:
                    with open(filepath, encoding='utf-8') as cached:
                        description = cached.read()
                except UnicodeDecodeError:
                    # Cache file in some other encoding: fall back to the
                    # platform default.
                    with open(filepath) as cached:
                        description = cached.read()
                print(colorama.Fore.GREEN + "read" + colorama.Fore.RESET)
            else:
                description = get_description(url)
                # Do not cache an empty result, so that the page is
                # retried on the next run.
                if description:
                    os.makedirs(os.path.dirname(filepath), exist_ok=True)
                    with open(filepath, 'w', encoding='utf-8') as cached:
                        cached.write(description)
                print("saved")
                try:
                    # Short pause between downloads.
                    time.sleep(0.5)
                except KeyboardInterrupt:
                    print("Exit.")
                    sys.exit(0)

            component['DESCRIPTION'] = url + '\n' + description

    with open(destination_path, 'wb') as f:
        f.write(gcal.to_ical())
    print("Done.\n\n")
def proceedfolder(folder):
    """Run proceed() on every source calendar directly inside *folder*.

    A source calendar is a regular file whose name ends with ".ics" but not
    with "-out.ics"; its result is written next to it as "<name>-out.ics".
    Every other directory entry is reported as not valid.
    """
    for entry in os.listdir(folder):
        full_path = folder + "/" + entry
        is_source = (
            os.path.isfile(full_path)
            and entry.endswith(".ics")
            and not entry.endswith("-out.ics")
        )
        if not is_source:
            print("Not valid: " + full_path)
            continue
        proceed(full_path, folder + "/" + entry[:-4] + "-out.ics")
if __name__ == '__main__':
    # Command-line entry point: either download the feeds for a year, or
    # add descriptions to the given calendar files / folders.
    print(sys.version)

    parser = argparse.ArgumentParser(description='Download and add Description to ical files using the calend.ru')
    parser.add_argument('files', type=str, nargs='*',
                        help="path to file(s) with or without the extension \".ics\" or folder(s)")
    parser.add_argument('-d', '--download', metavar='year',
                        help="download calendars to year subfolder (ignore files argument)")
    namespace = parser.parse_args(sys.argv[1:])

    if namespace.download:
        download(namespace.download)
        # sys.exit() rather than exit(): the latter is a helper installed
        # by the site module for interactive use.
        sys.exit()

    if not namespace.files:
        # Nothing to do: show the usage instead of ending silently.
        parser.print_help()

    for path in namespace.files:
        if os.path.isfile(path + ".ics"):
            # Path given without the extension.
            proceed(path + ".ics", path + "-out.ics")
        elif os.path.isfile(path) and path.endswith('.ics'):
            proceed(path, path[:-4] + "-out.ics")
        elif os.path.isdir(path):
            proceedfolder(path)
        else:
            print("Not valid: " + path)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment