import pandas as pd

from subprocess import run
from functools import partial

# Run all subprocess commands with captured, text-mode output.
run = partial(run, text=True, capture_output=True)


def running_crawls():
    """Get details of currently running spiders.

    Get a DataFrame showing the following details:

    * pid: Process ID. Use this to identify (or stop) the spider that you want.
    * started: The time when the spider started.
    * elapsed: The elapsed time since the spider started.
    * %mem: The percentage of memory that this spider is consuming.
    * %cpu: The percentage of CPU that this spider is consuming.
    * args: The full command that was used to start this spider. Use this to
      identify the spider(s) that you want to know about.
    * output_file: The path to the output file for each running crawl job.
    * crawled_urls: The current number of lines in ``output_file``.
    """
    # List all processes with the columns we need; the first line of the
    # output is the header row, which supplies the DataFrame column names.
    ps = run(['ps', 'xo', 'pid,start,etime,%mem,%cpu,args'])
    ps_stdout = ps.stdout.splitlines()
    df = pd.DataFrame([line.split(maxsplit=5) for line in ps_stdout[1:]],
                      columns=ps_stdout[0].split())
    # Extract the output file path from the `-o <file>.jl` argument.
    df['output_file'] = df['ARGS'].str.extract(r'-o (.*?\.jl)')[0]
    # Keep only the scrapy crawl processes.
    df_subset = (df[df['ARGS'].str.contains('scrapy runspider')]
                 .reset_index(drop=True))
    if df_subset.empty:
        return pd.DataFrame()
    # Count the lines crawled so far in each running job's output file.
    crawled_lines = run(['wc', '-l'] +
                        df_subset['output_file'].str.cat(sep=' ').split())
    crawl_urls = [int(line.strip().split()[0])
                  for line in crawled_lines.stdout.splitlines()]
    # `wc -l` appends a "total" line when given multiple files; drop it so the
    # counts align with the rows of df_subset.
    crawl_urls = crawl_urls[:min(len(crawl_urls), len(df_subset))]
    df_subset['crawled_urls'] = crawl_urls
    df_subset.columns = df_subset.columns.str.lower()
    return df_subset
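

# A minimal usage sketch (illustrative, not part of the function above). It
# assumes one or more `scrapy runspider ... -o <file>.jl` jobs are currently
# running on this machine; otherwise an empty DataFrame is returned. The
# lowercase column names assume the `ps` headers listed in the docstring.
if __name__ == '__main__':
    crawls = running_crawls()
    if crawls.empty:
        print('No running crawl jobs found.')
    else:
        # Show the columns most useful for monitoring crawl progress.
        print(crawls[['pid', 'elapsed', 'output_file', 'crawled_urls']])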