OSG site tester
#!/usr/bin/env ruby
# File: mk_test.rb
# Date: 2010-10-06
# Author: Allan Espinosa
# Email: [email protected]
# Description: A Swift workflow generator to test OSG sites through the Engage
# VO. Generates the accompanying tc.data and sites.xml as well.
# Run with "swift -sites.file sites.xml -tc.file tc.data
# test_osg.swift"
require 'erb'
require 'ostruct'
require 'ress'
swift_tc = %q[
localhost echo /bin/echo INSTALLED INTEL32::LINUX GLOBUS::maxwalltime="00:05:00"
<% ctr = 0
sites.each_key do |name| %>
<% jm = sites[name].jm
   url = sites[name].url
   app_dir = sites[name].app_dir
   data_dir = sites[name].data_dir
   throttle = sites[name].throttle %>
<%=name%> cat<%=ctr%> /bin/cat INSTALLED INTEL32::LINUX GLOBUS::maxwalltime="00:01:30"
<% ctr += 1
end %>
]
swift_workflow = %q[
type file;

app (file t) echo(string i) {
  echo i stdout=@filename(t);
}
<% ctr = 0
sites.each_key do |name| %>
app (file t) cat<%= ctr %>(file input) {
  cat<%= ctr %> @filename(input) stdout=@filename(t);
}
<% ctr += 1
end %>
<% ctr = 0
sites.each_key do |name| %>
file input<%= ctr %><"cat<%= ctr %>.in">;
input<%= ctr %> = echo("<%= name %>");
file out<%= ctr %><"cat<%= ctr %>.out">;
out<%= ctr %> = cat<%= ctr %>(input<%= ctr %>);
<% ctr += 1
end %>
]
condor_sites = %q[
<config>
  <pool handle="localhost">
    <filesystem provider="local" />
    <execution provider="local" />
    <workdirectory>/var/tmp</workdirectory>
    <profile namespace="karajan" key="jobThrottle">0</profile>
  </pool>
<% sites.each_key do |name| %>
<% jm = sites[name].jm
   url = sites[name].url
   app_dir = sites[name].app_dir
   data_dir = sites[name].data_dir
   throttle = sites[name].throttle %>
  <pool handle="<%=name%>">
    <execution provider="condor" url="none"/>
    <profile namespace="globus" key="jobType">grid</profile>
    <profile namespace="globus" key="gridResource">gt2 <%=url%>/jobmanager-<%=jm%></profile>
    <profile namespace="karajan" key="initialScore">20.0</profile>
    <profile namespace="karajan" key="jobThrottle"><%=throttle%></profile>
    <gridftp url="gsiftp://<%=url%>"/>
    <workdirectory><%=data_dir%>/swift_scratch</workdirectory>
  </pool>
<% end %>
</config>
]
# Blacklist of non-working sites
blacklist = [ ]
# Removes duplicate site entries (i.e., multiple GRAM endpoints)
sites = {}
ress_parse do |name, value|
  next if blacklist.index(name)
  sites[name] = value if sites[name] == nil
end
condor_out = File.open("sites.xml", "w")
tc_out = File.open("tc.data", "w")
swift_out = File.open("test_osg.swift", "w")
condor = ERB.new(condor_sites, 0, "%<>")
tc = ERB.new(swift_tc, 3, "%<>")
swift = ERB.new(swift_workflow, 0, "%<>")
condor_out.puts condor.result(binding)
tc_out.puts tc.result(binding)
swift_out.puts swift.result(binding)
condor_out.close
tc_out.close
swift_out.close
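# For illustration, one generated tc.data entry would look like the line
# below (the site name and gatekeeper host here are hypothetical; real
# values come from the ReSS ClassAds):
#
#   UC_Teraport__tp-grid1.uchicago.edu cat0 /bin/cat INSTALLED INTEL32::LINUX GLOBUS::maxwalltime="00:01:30"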
# File: ress.rb
# The helper library loaded above via "require 'ress'": queries the Engage
# VO's ReSS collector and parses each site's ClassAds into an OpenStruct.
require 'ostruct'
def ress_query(class_ads)
  cmd = "condor_status -pool engage-submit.renci.org"
  class_ads[0..-2].each do |class_ad|
    cmd << " -format \"%s|\" #{class_ad}"
  end
  cmd << " -format \"%s\\n\" #{class_ads[-1]}"
  `#{cmd}`
end
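# A sketch of the command ress_query builds for the class ads passed in
# from ress_parse below (one -format flag per ClassAd attribute):
#
#   condor_status -pool engage-submit.renci.org \
#     -format "%s|" GlueSiteUniqueID -format "%s|" GlueCEInfoHostName \
#     ... -format "%s\n" GlueCEInfoTotalCPUs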
def ress_parse
  dir_suffix = "/engage/swift"
  class_ads = [
    "GlueSiteUniqueID", "GlueCEInfoHostName", "GlueCEInfoJobManager",
    "GlueCEInfoGatekeeperPort", "GlueCEInfoApplicationDir", "GlueCEInfoDataDir",
    "GlueCEInfoTotalCPUs"
  ]
  ress_query(class_ads).each_line do |line|
    line.chomp!
    set = line.split("|")
    next if set.empty?
    value = OpenStruct.new
    value.jm = set[class_ads.index("GlueCEInfoJobManager")]
    value.url = set[class_ads.index("GlueCEInfoHostName")]
    # Size the throttle to roughly one job per advertised CPU, leaving two
    # CPUs free (Swift's jobThrottle value t allows about t * 100 jobs).
    value.throttle = (set[class_ads.index("GlueCEInfoTotalCPUs")].to_f - 2.0) / 100.0
    # Key on site ID plus gatekeeper host so multiple CEs of one site stay distinct.
    name = set[class_ads.index("GlueSiteUniqueID")] + "__" + value.url
    value.name = set[class_ads.index("GlueSiteUniqueID")]
    value.app_dir = set[class_ads.index("GlueCEInfoApplicationDir")]
    value.app_dir.sub!(/\/$/, "")
    value.data_dir = set[class_ads.index("GlueCEInfoDataDir")]
    value.data_dir.sub!(/\/$/, "")
    # Per-site overrides where the advertised directories need correcting.
    value.app_dir = "/osg/app" if name =~ /GridUNESP_CENTRAL/
    value.data_dir = "/osg/data" if name =~ /GridUNESP_CENTRAL/
    if name =~ /BNL-ATLAS/
      value.app_dir += "/engage-scec"
      value.data_dir += "/engage-scec"
    #elsif name == "LIGO_UWM_NEMO" or name == "SMU_PHY" or name == "UFlorida-HPC" or name == "RENCI-Engagement" or name == "RENCI-Blueridge"
    #  value.app_dir += "/osg/scec"
    #  value.data_dir += "/osg/scec"
    else
      value.app_dir += dir_suffix
      value.data_dir += dir_suffix
    end
    yield name, value
  end
end
# File: swift.properties (Swift engine configuration for the test run)
sites.file=sites.xml
tc.file=tc.data
#
# The host name of the submit machine is used by GRAM as a callback
# address to report the status of submitted jobs. In general, Swift
# can automatically detect the host name of the local machine.
# However, if the machine host name is improperly configured or if
# it does not represent a valid DNS entry, certain services (such as
# GRAM) will not be able to send job status notifications back to
# the client. The value of this property can be an IP address.
#
# Format:
# hostname=string
#
#hostname=localhost
#
# A TCP port range can be specified to restrict the ports on which GRAM
# callback services are started. This is likely needed if your submit
# host is behind a firewall, in which case the firewall should be
# configured to allow incoming connections on ports in the range.
#
# Format:
# tcp.port.range=start,end
#
#tcp.port.range=50000,50100
#
# false - means an error will be immediately reported and cause the
# workflow to abort. At this time remote jobs that are already
# running will not be canceled
# true - means that Swift will try to do as much work as possible and
# report all errors encountered at the end. However, "errors"
# here only applies to job execution errors. Certain errors
# that are related to the Swift implementation (should such
# errors occur) will still be reported eagerly.
#
# Default: false
#
lazy.errors=true
#
# What algorithm to use for caching of remote files. LRU (governing which
# files to purge) is the only implementation right now. One can set
# a target size (in bytes) for a host by using the swift:storagesize
# profile for a host in sites.xml
#
# Default: LRU
#
caching.algorithm=LRU
#
# true - generate a provenance graph in .dot format (Swift will
# choose a random file name)
# false - do not generate a provenance graph
# <filename> - generate a provenance graph in the given file name
#
# Default: false
#
pgraph=false
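#
# For example, to write the graph to a specific file instead of a randomly
# named one (the file name here is only illustrative):
#
#pgraph=provenance.dot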
#
# graph properties for the provenance graph (.dot specific)
#
# Default: splines="compound", rankdir="TB"
#
pgraph.graph.options=splines="compound", rankdir="TB"
#
# node properties for the provenance graph (.dot specific)
#
# Default: color="seagreen", style="filled"
#
pgraph.node.options=color="seagreen", style="filled"
#
# true - clustering of small jobs is enabled. Clustering works in the
# following way: If a job is clusterable (meaning that it has the
# GLOBUS::maxwalltime profile specified in tc.data and its value
# is less than the value of the "clustering.min.time" property) it will
# be put in a clustering queue. The queue is processed at intervals
# specified by the "clustering.queue.delay" property. The processing
# of the clustering queue consists of selecting compatible jobs and
# grouping them in clusters whose max wall time does not exceed twice
# the value of the "clustering.min.time" property. Two or more jobs are
# considered compatible if they share the same site and do not have
# conflicting profiles (e.g. different values for the same environment
# variable).
# false - clustering of small jobs is disabled.
#
# Default: false
#
clustering.enabled=false
#
# <seconds> - the intervals at which the clustering queue is processed
#
# Default: 4
#
clustering.queue.delay=60
#
# <seconds> - the threshold time for clustering
#
# Default: 60
#
clustering.min.time=3600
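#
# Worked example with the values above (assuming clustering.enabled were set
# to true): a job whose GLOBUS::maxwalltime in tc.data is under 3600 seconds
# is considered clusterable; every 60 seconds the queue is processed, and
# compatible jobs are grouped into clusters of at most 2 * 3600 = 7200
# seconds of total wall time.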
#
# Kickstart is a useful tool that can be used to gather various information
# about a remote process. Before it can be used it must be installed on the
# remote site and the corresponding entry be set in the sites file.
# This option controls how Swift uses Kickstart. The following
# values are possible:
# false - do not use Kickstart
# true - use Kickstart. If a job is scheduled on a site that does not have
# Kickstart installed, that job will fail.
# maybe - Use Kickstart if installed (i.e. the entry is present in the sites
# file)
#
# Default: maybe
#
kickstart.enabled=maybe
#
# Indicates when Kickstart records should be fetched from the remote site:
# true - always transfer Kickstart records if Kickstart was used (see
# kickstart.enabled)
# false - only transfer Kickstart records if the job fails
#
# Default: false
#
kickstart.always.transfer=false
#
# Indicates when wrapper logs should be fetched from the remote site:
# true - always transfer wrapper logs
# false - only transfer wrapper logs if the job fails
#
# Default: false
#
wrapperlog.always.transfer=false
###########################################################################
# Throttling options #
###########################################################################
#
# For the throttling parameters, valid values are either a positive integer
# or "off" (without the quotes).
#
#
# Limits the number of concurrent submissions for a workflow instance. This
# throttle only limits the number of concurrent tasks (jobs) that are being
# sent to sites, not the total number of concurrent jobs that can be run.
# The submission stage in GRAM is one of the most CPU expensive stages (due
# mostly to the mutual authentication and delegation). Having too many
# concurrent submissions can overload the submit host CPU, the remote
# host/head node, or both, causing degraded performance.
#
# Default: 4
#
throttle.submit=4
#throttle.submit=off
#
# Limits the number of concurrent submissions for any of the sites Swift will
# try to send jobs to. In other words, it guarantees that, for any given
# site, no more jobs than the value of this throttle will be in the process
# of being submitted concurrently.
#
# Default: 2
#
throttle.host.submit=2
#throttle.host.submit=off
#
# The Swift scheduler has the ability to limit the number of concurrent jobs
# allowed on a site based on the performance history of that site. Each site
# is assigned a score (initially 1), which can increase or decrease based
# on whether the site yields successful or faulty job runs. The score for a
# site can take values in the (0.1, 100) interval. The number of allowed jobs
# is calculated using the following formula:
# 2 + score*throttle.score.job.factor
# This means a site will always be allowed at least two concurrent jobs and
# at most 2 + 100*throttle.score.job.factor. With a default of 4 this means
# at least 2 jobs and at most 402.
#
# Default: 4
#
throttle.score.job.factor=0.2
#throttle.score.job.factor=off
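#
# Worked example with the value above: the best possible site (score 100) is
# allowed at most 2 + 100 * 0.2 = 22 concurrent jobs, instead of the 402
# permitted by the default factor of 4.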
#
# Limits the total number of concurrent file transfers that can happen at any
# given time. File transfers consume bandwidth. Too many concurrent transfers
# can cause the network to be overloaded, preventing various other signalling
# traffic from flowing properly.
#
# Default: 4
#
throttle.transfers=4
#throttle.transfers=off
# Limits the total number of concurrent file operations that can happen at any
# given time. File operations (like transfers) require an exclusive connection
# to a site. These connections can be expensive to establish. A large number
# of concurrent file operations may cause Swift to attempt to establish many
# such expensive connections to various sites. Limiting the number of concurrent
# file operations causes Swift to use a small number of cached connections and
# achieve better overall performance.
#
# Default: 8
#
throttle.file.operations=8
#throttle.file.operations=off
# Indicates whether the working directory on the remote site should be
# left intact even when the workflow completes successfully. This can be
# used to inspect the site working directory for debugging purposes.
#
# Default: false
#
sitedir.keep=false
# Number of times a job will be retried if it fails (giving a maximum of
# 1 + execution.retries attempts at execution).
#
execution.retries=0
# Enables/disables replication. Replication is used to deal with jobs sitting
# in batch queues for abnormally long periods of time. If replication is enabled
# and certain conditions are met, Swift creates and submits replicas of jobs, and
# allows multiple instances of a job to compete.
#
replication.enabled=false
# If replication is enabled, this value specifies the minimum time, in seconds,
# a job needs to be queued in a batch queue in order to be considered for
# replication
#
replication.min.queue.time=180
# The maximum number of replicas that Swift should attempt.
replication.limit=3
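#
# Worked example with the values above (assuming replication.enabled were set
# to true): a job queued remotely for more than 180 seconds becomes a
# candidate for replication, and Swift will submit at most 3 replicas of it.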
#
# WARNING: This option is deprecated. Please use the hostname option.
#
# The IP address of the submit machine is used by GRAM as a callback
# address to report the status of submitted jobs. In general, Swift
# can automatically detect the IP address of the local machine.
# However, if the machine has more than one network interface, Swift
# will pick the first one, which may not be the right choice. It is
# recommended that this property is set properly before attempting to
# run jobs through GRAM.
#
# Format:
# ip.address=x.y.z.w
#
#ip.address=127.0.0.1
# Controls how Swift will communicate the result code of running user programs
# from workers to the submit side. In files mode, a file
# indicating success or failure will be created on the site shared filesystem.
# In provider mode, the execution provider job status will
# be used. Notably, GRAM2 does not return job statuses correctly, and so
# provider mode will not work with GRAM2. With other
# providers, it can be used to reduce the amount of filesystem access compared
# to files mode.
#
status.mode=files
# Controls how swift will supply parameters to the remote wrapper script.
# 'args' mode will pass parameters on the command line
# 'files' mode will pass parameters through an additional input file
#
# valid values: args, files
# Default: files
#
# wrapper.parameter.mode=args
# Determines whether Swift remote wrappers are invoked via an absolute path
# or a path relative to the job's initial working directory.
#
# valid values: absolute, relative
# wrapper.invocation.mode=absolute
#
# Limits the number of concurrent iterations that each foreach statement
# can have at one time. This conserves memory for swift programs that
# have large numbers of iterations (which would otherwise all be executed
# in parallel).
#
# Default: 1024
#
foreach.max.threads=10000
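#
# For example, a workflow containing a loop along the lines of
#   foreach i in [1:1000000] { ... }
# would otherwise try to start every iteration at once; with the value above,
# at most 10000 iterations of each foreach are in flight at a time.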
# Controls whether the log file will contain provenance information.
# Enabling this will increase the size of log files, sometimes
# significantly.
provenance.log=false
# Controls whether file staging is done by swift or by the execution
# provider. If set to false, the standard swift staging mechanism is
# used. If set to true, swift does not stage files. Instead, the
# execution provider is instructed to stage files in and out.
#
# Provider staging is experimental.
#
# When enabled, and when coasters are used as an execution provider,
# a staging mechanism can be selected for each site
# using the swift:stagingMethod site profile in sites.xml. The
# following is a list of accepted mechanisms:
#
# * file: Staging is done from a filesystem accessible to the
# coaster service (typically running on the head node)
# * proxy: Staging is done from a filesystem accessible to the
# client machine that swift is running on, and is proxied
# through the coaster service
# * sfs: (short for "shared filesystem") Staging is done by
# copying files to and from a filesystem accessible
# by the compute node (such as an NFS or GPFS mount).
use.provider.staging=false
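#
# When provider staging is enabled with coasters, a per-site mechanism could
# be selected in sites.xml with a profile entry along these lines (mirroring
# the profile syntax used by the generator above):
#
#   <profile namespace="swift" key="stagingMethod">file</profile>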