Source code for threeML.utils.data_download.Fermi_LAT.download_LAT_data

from __future__ import print_function
from future import standard_library

standard_library.install_aliases()
from builtins import str
import html.parser
import re
import socket
import time
import urllib.request, urllib.parse, urllib.error
import os
import glob

import astropy.io.fits as pyfits

from threeML.io.file_utils import sanitize_filename
from threeML.config.config import threeML_config
from threeML.utils.unique_deterministic_tag import get_unique_deterministic_tag
from threeML.io.download_from_http import ApacheDirectory

# Set default timeout for operations
socket.setdefaulttimeout(120)


class DivParser(html.parser.HTMLParser):
    """
    Extract data from a <div></div> tag
    """

    def __init__(self, desiredDivName):

        html.parser.HTMLParser.__init__(self)

        self.recording = 0

        self.data = []

        self.desiredDivName = desiredDivName
    def handle_starttag(self, tag, attributes):

        if tag != "div":
            return

        if self.recording:
            self.recording += 1
            return

        for name, value in attributes:

            if name == "id" and value == self.desiredDivName:
                break

        else:

            return

        self.recording = 1
    def handle_endtag(self, tag):

        if tag == "div" and self.recording:
            self.recording -= 1
    def handle_data(self, data):

        if self.recording:
            self.data.append(data)
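
# A minimal sketch of how DivParser is used below in download_LAT_data
# ("sec-wrapper" is one of the two div ids actually parsed by this module):
#
#     parser = DivParser("sec-wrapper")
#     parser.feed('<div id="sec-wrapper">The state of your query is 2</div>')
#     parser.data  # -> ['The state of your query is 2']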
# Keyword name to store the unique ID for the download
_uid_fits_keyword = "QUERYUID"
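
# A minimal sketch of how this keyword round-trips through a FITS primary
# header, assuming a hypothetical file name (download_LAT_data does exactly
# this on the real FT1/FT2 files):
#
#     with pyfits.open("L12345_PH00.fits", mode="update") as f:  # hypothetical file
#         f[0].header.set(_uid_fits_keyword, "some_unique_tag")
#
#     with pyfits.open("L12345_PH00.fits") as f:
#         f[0].header.get(_uid_fits_keyword)  # -> 'some_unique_tag'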
def merge_LAT_data(ft1s, destination_directory=".", outfile="ft1_merged.fits"):

    outfile = os.path.join(destination_directory, outfile)

    if os.path.exists(outfile):

        print(
            "Existing merged event file %s corresponds to the same selection. "
            "We assume you did not tamper with it, so we will return it instead of merging again. "
            "If you want to redo the merged FT1 file, remove it from the outdir" % (outfile)
        )

        return outfile

    if len(ft1s) == 1:

        print("Only one FT1 file provided. Skipping the merge...")

        import shutil

        shutil.copyfile(ft1s[0], outfile)

        return outfile

    # Write the list of FT1 files to a text file, which gtselect reads via the @ syntax
    _filelist = "_filelist.txt"
    infile = os.path.join(destination_directory, _filelist)

    with open(infile, "w") as infile_list:

        for ft1 in ft1s:
            infile_list.write(ft1 + "\n")

    # Merging requires gtselect from the Fermi Science Tools
    from GtApp import GtApp

    gtselect = GtApp("gtselect")
    gtselect["infile"] = "@" + infile
    gtselect["outfile"] = outfile
    gtselect["ra"] = "INDEF"
    gtselect["dec"] = "INDEF"
    gtselect["rad"] = "INDEF"
    gtselect["tmin"] = "INDEF"
    gtselect["tmax"] = "INDEF"
    gtselect["emin"] = "30"
    gtselect["emax"] = "1000000"
    gtselect["zmax"] = 180
    gtselect.run()

    return outfile
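
# A minimal usage sketch for merge_LAT_data, assuming hypothetical file names.
# Note that merging more than one FT1 file requires the Fermi Science Tools,
# since gtselect is imported from GtApp:
#
#     merged_ft1 = merge_LAT_data(
#         ["L12345_PH00.fits", "L12345_PH01.fits"],
#         destination_directory="my_new_data",
#         outfile="ft1_merged.fits",
#     )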
def download_LAT_data(
    ra,
    dec,
    radius,
    tstart,
    tstop,
    time_type,
    data_type="Photon",
    destination_directory=".",
):
    """
    Download data from the public LAT data server (of course you need a working internet connection). Data are
    selected in a circular Region of Interest (cone) centered on the provided coordinates.

    Example:

    ```
    > download_LAT_data(195.6, -35.4, 12.0, '2008-09-16 01:00:23', '2008-09-18 01:00:23',
    time_type='Gregorian', destination_directory='my_new_data')
    ```

    :param ra: R.A. (J2000) of the center of the ROI
    :param dec: Dec. (J2000) of the center of the ROI
    :param radius: radius (in degrees) of the ROI (use a larger radius than what you will need in the analysis)
    :param tstart: start time for the data
    :param tstop: stop time for the data
    :param time_type: type of the time input (one of MET, Gregorian or MJD)
    :param data_type: type of data to download. Use Photon if you use Source or cleaner classes, Extended otherwise.
        Default is Photon.
    :param destination_directory: directory where you want to save the data (default: current directory)
    :return: the paths to the downloaded FT1 and FT2 files
    """

    _known_time_types = ["MET", "Gregorian", "MJD"]

    assert time_type in _known_time_types, "Time type must be one of %s" % ",".join(
        _known_time_types
    )

    valid_classes = ["Photon", "Extended"]
    assert data_type in valid_classes, "Data type must be one of %s" % ",".join(
        valid_classes
    )

    assert radius > 0, "Radius of the Region of Interest must be > 0"
    assert 0 <= ra <= 360.0, "R.A. must be 0 <= ra <= 360"
    assert -90 <= dec <= 90, "Dec. must be -90 <= dec <= 90"

    # Create the output directory if it does not exist
    destination_directory = sanitize_filename(destination_directory, abspath=True)

    if not os.path.exists(destination_directory):
        os.makedirs(destination_directory)

    # This will automatically complete the form available at
    # http://fermi.gsfc.nasa.gov/cgi-bin/ssc/LAT/LATDataQuery.cgi
    # After submitting the form, an html page will inform about
    # the identifier assigned to the query and the time which will be
    # needed to process it. After retrieving the query number,
    # this function will wait for the files to be completed on the server,
    # then it will download them

    url = threeML_config["LAT"]["query form"]

    # Save the parameters for the query in a dictionary
    query_parameters = {}
    query_parameters["coordfield"] = "%.4f,%.4f" % (ra, dec)
    query_parameters["coordsystem"] = "J2000"
    query_parameters["shapefield"] = "%s" % radius
    query_parameters["timefield"] = "%s,%s" % (tstart, tstop)
    query_parameters["timetype"] = "%s" % time_type
    query_parameters["energyfield"] = "30,1000000"  # Download everything, we will choose later
    query_parameters["photonOrExtendedOrNone"] = data_type
    query_parameters["destination"] = "query"
    query_parameters["spacecraft"] = "checked"

    # Compute a unique ID for this query
    query_unique_id = get_unique_deterministic_tag(str(query_parameters))

    # Look for FT1 and FT2 files in the output directory matching this unique ID
    ft1s = glob.glob(os.path.join(destination_directory, "*PH??.fits"))
    ft2s = glob.glob(os.path.join(destination_directory, "*SC??.fits"))

    # Loop over all FT1 files and collect those matching the unique ID
    prev_downloaded_ft1s = []
    prev_downloaded_ft2 = None

    for ft1 in ft1s:

        with pyfits.open(ft1) as f:

            this_query_uid = f[0].header.get(_uid_fits_keyword)

            if this_query_uid == query_unique_id:

                # Found one!
                # Append it to the list, as there might be others
                prev_downloaded_ft1s.append(ft1)

    if len(prev_downloaded_ft1s) > 0:

        for ft2 in ft2s:

            with pyfits.open(ft2) as f:

                this_query_uid = f[0].header.get(_uid_fits_keyword)

                if this_query_uid == query_unique_id:

                    # Found it! (FT2 is a single file)
                    prev_downloaded_ft2 = ft2

                    break

    else:
        # No need to look any further: if there is no FT1 file, there should not be any FT2 file either
        pass

    # If we have both FT1 and FT2 files matching the ID, we do not need to download again
    if len(prev_downloaded_ft1s) > 0 and prev_downloaded_ft2 is not None:

        print(
            "Existing event files %s and Spacecraft file %s correspond to the same selection. "
            "We assume you did not tamper with them, so we will return those instead of downloading them again. "
            "If you want to download them again, remove them from the outdir"
            % (prev_downloaded_ft1s, prev_downloaded_ft2)
        )

        return (
            merge_LAT_data(
                prev_downloaded_ft1s,
                destination_directory,
                outfile="L%s_FT1.fits" % query_unique_id,
            ),
            prev_downloaded_ft2,
        )

    # Print out the query parameters
    print("Query parameters:")

    for k, v in query_parameters.items():
        print("%30s = %s" % (k, v))

    # POST encoding
    postData = urllib.parse.urlencode(query_parameters).encode("utf-8")

    temporaryFileName = "__temp_query_result.html"

    # Remove the temporary file if present
    try:

        os.remove(temporaryFileName)

    except OSError:

        pass

    # This is to avoid caching
    urllib.request.urlcleanup()

    # Submit the compiled form
    try:

        urllib.request.urlretrieve(url, temporaryFileName, lambda x, y, z: 0, postData)

    except socket.timeout:

        raise RuntimeError(
            "Time out when connecting to the server. Check your internet connection, or that the "
            "form at %s is accessible, then retry" % url
        )

    except Exception as e:

        print(e)

        raise RuntimeError(
            "Problems with the download. Check your internet connection, or that the "
            "form at %s is accessible, then retry" % url
        )

    # Now open the file, parse it and get the query ID
    with open(temporaryFileName) as htmlFile:

        lines = []

        for line in htmlFile:
            lines.append(line)

        html = " ".join(lines).strip()

    os.remove(temporaryFileName)

    # Extract data from the response
    parser = DivParser("sec-wrapper")
    parser.feed(html)

    if parser.data == []:

        parser = DivParser("right-side")
        parser.feed(html)

    try:

        # Get the line containing the time estimate
        estimatedTimeLine = [
            x
            for x in parser.data
            if x.find("The estimated time for your query to complete is") == 0
        ][0]

        # Extract the time estimate
        estimatedTimeForTheQuery = re.findall(
            "The estimated time for your query to complete is ([0-9]+) seconds",
            estimatedTimeLine,
        )[0]

    except IndexError:

        raise RuntimeError(
            "Problems with the download. Empty or wrong answer from the LAT server. "
            "Please retry later."
        )

    else:

        print(
            "\nEstimated completion time for your query: %s seconds"
            % estimatedTimeForTheQuery
        )

    http_address = [
        x for x in parser.data if x.find("https://fermi.gsfc.nasa.gov") >= 0
    ][0]

    print(
        "\nIf this download fails, you can find your data at %s (when ready)\n"
        % http_address
    )

    # Now periodically check whether the query is complete
    startTime = time.time()
    timeout = max(1.5 * max(5.0, float(estimatedTimeForTheQuery)), 120)  # seconds
    refreshTime = min(float(estimatedTimeForTheQuery) / 2.0, 5.0)  # seconds

    # Precompile the regular expression matching the download links
    regexpr = re.compile("wget (.*.fits)")

    # Fetch the status of the query every refreshTime seconds, until we get status=2 (success)
    links = None
    fakeName = "__temp__query__result.html"

    while time.time() <= startTime + timeout:

        # Try and fetch the html with the results
        try:

            _ = urllib.request.urlretrieve(http_address, fakeName)

        except socket.timeout:

            urllib.request.urlcleanup()

            raise RuntimeError(
                "Time out when connecting to the server. Check your internet connection, or that "
                "you can access %s, then retry" % threeML_config["LAT"]["query form"]
            )

        except Exception as e:

            print(e)

            urllib.request.urlcleanup()

            raise RuntimeError(
                "Problems with the download. Check your connection or that you can access "
                "%s, then retry." % threeML_config["LAT"]["query form"]
            )

        with open(fakeName) as f:

            html = " ".join(f.readlines())

        status = re.findall("The state of your query is ([0-9]+)", html)[0]

        if status == "2":

            # Success! Get the download links
            links = regexpr.findall(html)

            # Remove the temporary file
            os.remove(fakeName)

            # We're done
            break

        else:

            # Clean up and try again after a while
            os.remove(fakeName)

            urllib.request.urlcleanup()

            time.sleep(refreshTime)

    remotePath = "%s/queries/" % threeML_config["LAT"]["public HTTP location"]

    if links is not None:

        filenames = [x.split("/")[-1] for x in links]

        print("\nDownloading FT1 and FT2 files...")

        downloader = ApacheDirectory(remotePath)

        downloaded_files = [
            downloader.download(filename, destination_directory)
            for filename in filenames
        ]

    else:

        raise RuntimeError("Could not download LAT Standard data")

    # Now separate the FT1 files from the FT2 file (they might be out of order)
    FT1 = []
    FT2 = None

    for fits_file in downloaded_files:

        # Open the FITS file and write the unique key for this query, so that the download will not be
        # repeated if not necessary
        with pyfits.open(fits_file, mode="update") as f:

            f[0].header.set(_uid_fits_keyword, query_unique_id)

        if re.match(".+SC[0-9][0-9].fits", str(fits_file)) is not None:

            FT2 = fits_file

        else:

            FT1.append(fits_file)

    return (
        merge_LAT_data(
            FT1, destination_directory, outfile="L%s_FT1.fits" % query_unique_id
        ),
        FT2,
    )
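
# A minimal end-to-end sketch, reusing the coordinates and times from the
# docstring example above; the two return values are the (possibly merged)
# FT1 event file and the FT2 spacecraft file:
#
#     ft1, ft2 = download_LAT_data(
#         195.6,
#         -35.4,
#         12.0,
#         "2008-09-16 01:00:23",
#         "2008-09-18 01:00:23",
#         time_type="Gregorian",
#         destination_directory="my_new_data",
#     )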