Source code for threeML.utils.data_download.Fermi_LAT.download_LAT_data

from __future__ import print_function

import glob
import os
import re
import socket
import time
import html.parser
import urllib.error
import urllib.parse
import urllib.request
from builtins import str
from pathlib import Path

# NOTE(review): the next four import targets were stripped by the HTML scrape
# ("import as pyfits", "from import ..."); module paths reconstructed from the
# names used below -- confirm against the actual threeML package layout.
import astropy.io.fits as pyfits

from threeML.config.config import threeML_config
from threeML.exceptions.custom_exceptions import TimeTypeNotKnown
from threeML.io.download_from_http import ApacheDirectory
from threeML.io.file_utils import sanitize_filename
from threeML.io.logging import setup_logger
from threeML.utils.unique_deterministic_tag import get_unique_deterministic_tag

log = setup_logger(__name__)

# Set default timeout for operations

[docs]class DivParser(html.parser.HTMLParser): """ Extract data from a <div></div> tag """ def __init__(self, desiredDivName): html.parser.HTMLParser.__init__(self) self.recording = 0 = [] self.desiredDivName = desiredDivName
[docs] def handle_starttag(self, tag, attributes): if tag != "div": return if self.recording: self.recording += 1 return for name, value in attributes: if name == "id" and value == self.desiredDivName: break else: return self.recording = 1
[docs] def handle_endtag(self, tag): if tag == "div" and self.recording: self.recording -= 1
[docs] def handle_data(self, data): if self.recording:
# Keyword name to store the unique ID for the download _uid_fits_keyword = "QUERYUID"
[docs]def merge_LAT_data(ft1s, destination_directory: str = ".", outfile: str = 'ft1_merged.fits', Emin: float = 30.0, Emax: float = 1e6) -> Path: outfile: Path = Path(destination_directory) / outfile if outfile.exists(): log.warning( f"Existing merged event file {outfile} correspond to the same selection. " "We assume you did not tamper with it, so we will return it instead of merging it again. " "If you want to redo the FT1 file again, remove it from the outdir" ) return outfile if len(ft1s) == 1: log.warning('Only one FT1 file provided. Skipping the merge...') import shutil shutil.copyfile(ft1s[0], outfile) return outfile _filelist = "_filelist.txt" infile: Path = Path(destination_directory) / _filelist infile_list ='w') for ft1 in ft1s: infile_list.write(str(ft1) + '\n') infile_list.close() from GtApp import GtApp gtselect = GtApp('gtselect') gtselect['infile'] = '@' + str(infile) gtselect['outfile'] = str(outfile) gtselect['ra'] = 'INDEF' gtselect['dec'] = 'INDEF' gtselect['rad'] = 'INDEF' gtselect['tmin'] = 'INDEF' gtselect['tmax'] = 'INDEF' gtselect['emin'] = '%.3f' % Emin gtselect['emax'] = '%.3f' % Emax gtselect['zmax'] = 180 return outfile
[docs]def download_LAT_data( ra: float, dec: float, radius: float, tstart: float, tstop: float, time_type: str, data_type: str = "Photon", destination_directory: str = ".", Emin: float = 30., Emax: float = 1000000. ) -> Path: """ Download data from the public LAT data server (of course you need a working internet connection). Data are selected in a circular Region of Interest (cone) centered on the provided coordinates. Example: ``` > download_LAT_data(195.6, -35.4, 12.0, '2008-09-16 01:00:23', '2008-09-18 01:00:23', time_type='Gregorian', destination_directory='my_new_data') ``` :param ra: R.A. (J2000) of the center of the ROI :param dec: Dec. (J2000) of the center of the ROI :param radius: radius (in degree) of the center of the ROI (use a larger radius than what you will need in the analysis) :param tstart: start time for the data :param tstop: stop time for the data :param time_type: type of the time input (one of MET, Gregorian or MJD) :param data_type: type of data to download. Use Photon if you use Source or cleaner classes, Extended otherwise. Default is Photon. :param destination_directory: directory where you want to save the data (default: current directory) :param Emin: minimum photon energy (in MeV) to download (default: 30 MeV, must be between 30 and 1e6 MeV) :param Emax: maximum photon energy (in MeV) to download (default: 1e6 MeV, must be betwen 30 and 1e6 MeV ) :return: the path to the downloaded FT1 and FT2 file """ _known_time_types = ["MET", "Gregorian", "MJD"] if time_type not in _known_time_types: out = ",".join(_known_time_types) log.error( f"Time type must be one of {out}" ) raise TimeTypeNotKnown() valid_classes = ["Photon", "Extended"] if data_type not in valid_classes: out = ",".join(valid_classes) log.error( f"Data type must be one of {out}" ) raise TypeError() if radius <= 0: log.error( "Radius of the Region of Interest must be > 0" ) raise ValueError() if not (0 <= ra <= 360.0): log.error( "R.A. 
must be 0 <= ra <= 360" ) raise ValueError() if not -90 <= dec <= 90: log.error( "Dec. must be -90 <= dec <= 90" ) raise ValueError() fermiEmin = 30 fermiEmax = 1e6 if Emin < fermiEmin: log.warning( f"Setting Emin from {Emin} to 30 MeV (minimum available energy for Fermi-LAT data)" ) Emin = fermiEmin if Emin > fermiEmax: log.warning( f"Setting Emin from {Emin} to 1 TeV (maximum available energy for Fermi-LAT data)" ) Emin = fermiEmax if Emax < fermiEmin: log.warning( f"Setting Emax from {Emax} to 30 MeV (minimum available energy for Fermi-LAT data)" ) Emax = fermiEmin if Emax > fermiEmax: log.warning( f"Setting Emax from {Emax} to 1 TeV (maximum available energy for Fermi-LAT data)" ) Emax = fermiEmax if Emin >= Emax: log.error( f"Minimum energy ({Emin}) must be less than maximum energy ({Emax}) for download." ) raise ValueError() # create output directory if it does not exists destination_directory = sanitize_filename( destination_directory, abspath=True) if not destination_directory.exists(): destination_directory.mkdir() # This will complete automatically the form available at # # After submitting the form, an html page will inform about # the identifier assigned to the query and the time which will be # needed to process it. 
After retrieving the query number, # this function will wait for the files to be completed on the server, # then it will download them url: str = threeML_config.LAT.query_form # Save parameters for the query in a dictionary query_parameters = {} query_parameters["coordfield"] = "%.4f,%.4f" % (ra, dec) query_parameters["coordsystem"] = "J2000" query_parameters["shapefield"] = "%s" % radius query_parameters["timefield"] = "%s,%s" % (tstart, tstop) query_parameters["timetype"] = "%s" % time_type query_parameters["energyfield"] = "%.3f,%.3f" % (Emin, Emax) query_parameters["photonOrExtendedOrNone"] = data_type query_parameters["destination"] = "query" query_parameters["spacecraft"] = "checked" # Print them out"Query parameters:") for k, v in query_parameters.items():"%30s = %s" % (k, v)) # Compute a unique ID for this query query_unique_id = get_unique_deterministic_tag(str(query_parameters)) "Query ID: %s" % query_unique_id) # Look if there are FT1 and FT2 files in the output directory matching this unique ID ft1s = [x for x in destination_directory.glob("*PH??.fits")] ft2s = [x for x in destination_directory.glob("*SC??.fits")] # Loop over all ft1s and see if there is any matching the uid prev_downloaded_ft1s = [] prev_downloaded_ft2 = None for ft1 in ft1s: with as f: this_query_uid = f[0].header.get(_uid_fits_keyword) if this_query_uid == query_unique_id: # Found one! Append to the list as there might be others prev_downloaded_ft1s.append(ft1) # break pass if len(prev_downloaded_ft1s) > 0: for ft2 in ft2s: with as f: this_query_uid = f[0].header.get(_uid_fits_keyword) if this_query_uid == query_unique_id: # Found one! 
(FT2 is a single file) prev_downloaded_ft2 = ft2 break else: # No need to look any further, if there is no FT1 file there shouldn't be any FT2 file either pass # If we have both FT1 and FT2 matching the ID, we do not need to download anymore if len(prev_downloaded_ft1s) > 0 and prev_downloaded_ft2 is not None: log.warning( f"Existing event file {prev_downloaded_ft1s} and Spacecraft file {prev_downloaded_ft2} correspond to the same selection. " "We assume you did not tamper with them, so we will return those instead of downloading them again. " "If you want to download them again, remove them from the outdir" ) return ( merge_LAT_data( prev_downloaded_ft1s, destination_directory, outfile="L%s_FT1.fits" % query_unique_id, Emin = Emin, Emax = Emax ), prev_downloaded_ft2, ) # POST encoding postData = urllib.parse.urlencode(query_parameters).encode("utf-8") temporaryFileName = "__temp_query_result.html" # Remove temp file if present try: os.remove(temporaryFileName) except: pass # This is to avoid caching urllib.request.urlcleanup() # Get the form compiled try: urllib.request.urlretrieve( url, temporaryFileName, lambda x, y, z: 0, postData) except socket.timeout: log.error( "Time out when connecting to the server. Check your internet connection, or that the " f"form at {url} is accessible, then retry" ) raise RuntimeError( ) except Exception as e: log.error(e) log.exception("Problems with the download. 
Check your internet connection, or that the " f"form at {url} is accessible, then retry") raise RuntimeError( ) # Now open the file, parse it and get the query ID with open(temporaryFileName) as htmlFile: lines = [] for line in htmlFile: # lines.append(line.encode('utf-8')) lines.append(line) html = " ".join(lines).strip() os.remove(temporaryFileName) # Extract data from the response parser = DivParser("sec-wrapper") parser.feed(html) if == []: parser = DivParser("right-side") parser.feed(html) try: # Get line containing the time estimation estimatedTimeLine = [ x for x in if x.find("The estimated time for your query to complete is") == 0 ][0] # Get the time estimate estimated_time_for_the_query = re.findall( "The estimated time for your query to complete is ([0-9]+) seconds", estimatedTimeLine, )[0] except: raise RuntimeError( "Problems with the download. Empty or wrong answer from the LAT server. " "Please retry later." ) else: f"Estimated complete time for your query: {estimated_time_for_the_query} seconds" ) http_address = [ x for x in if x.find("") >= 0 ][0] f"If this download fails, you can find your data at {http_address} (when ready)" ) # Now periodically check if the query is complete startTime = time.time() timeout = max( 1.5 * max(5.0, float(estimated_time_for_the_query)), 120) # Seconds refreshTime = min(float(estimated_time_for_the_query) / 2.0, 5.0) # Seconds # precompile Url regular expression regexpr = re.compile("wget (.*.fits)") # Now download every tot seconds the status of the query, until we get status=2 (success) links = None fakeName = "__temp__query__result.html" while time.time() <= startTime + timeout: # Try and fetch the html with the results try: _ = urllib.request.urlretrieve( http_address, fakeName, ) except socket.timeout: urllib.request.urlcleanup() log.exception( "Time out when connecting to the server. 
Check your internet connection, or that " f"you can access {threeML_config.LAT.query_form}, then retry") raise RuntimeError( ) except Exception as e: log.error(e) urllib.request.urlcleanup() log.exception("Problems with the download. Check your connection or that you can access " f"{threeML_config.LAT.query_form}, then retry.") raise RuntimeError( ) with open(fakeName) as f: html = " ".join(f.readlines()) status = re.findall("The state of your query is ([0-9]+)", html)[0] if status == "2": # Success! Get the download link links = regexpr.findall(html) # Remove temp file os.remove(fakeName) # we're done break else: # Clean up and try again after a while os.remove(fakeName) urllib.request.urlcleanup() time.sleep(refreshTime) # Continue to next iteration remotePath = "%s/queries/" % threeML_config.LAT.public_http_location if links != None: filenames = [x.split("/")[-1] for x in links]"Downloading FT1 and FT2 files...") downloader = ApacheDirectory(remotePath) downloaded_files = [, destination_directory) for filename in filenames ] else: log.error( "Could not download LAT Standard data" ) raise RuntimeError() # Now we need to sort so that the FT1 is always first (they might be out of order) # Separate the FT1 and FT2 files: FT1 = [] FT2 = None for fits_file in downloaded_files: # Open the FITS file and write the unique key for this query, so that the download will not be # repeated if not necessary with, mode="update") as f: f[0].header.set(_uid_fits_keyword, query_unique_id) if re.match(".+SC[0-9][0-9].fits", str(fits_file)) is not None: FT2 = fits_file else: FT1.append(fits_file) # If FT2 is first, switch them, otherwise do nothing # if re.match(".+SC[0-9][0-9].fits", downloaded_files[0]) is not None: return ( merge_LAT_data( FT1, destination_directory, outfile="L%s_FT1.fits" % query_unique_id, Emin = Emin, Emax = Emax ), FT2 )