/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to you under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.stormcrawler.protocol;

import crawlercommons.robots.BaseRobotRules;
import java.util.ArrayList;
import java.util.LinkedList;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.regex.Pattern;
import org.apache.storm.Config;
import org.apache.stormcrawler.Metadata;
import org.apache.stormcrawler.util.InitialisationUtil;
import org.jetbrains.annotations.NotNull;
import org.jetbrains.annotations.Nullable;
import org.slf4j.LoggerFactory;

/**
 * Protocol implementation that enables selection from a collection of sub-protocols using filters
 * based on each call's metadata and URL.
 *
 * <p>It is configured like this
 *
 * <pre>
 * protocol.delegator.config:
 * - className: "org.apache.stormcrawler.protocol.httpclient.HttpProtocol"
 *   filters:
 *     domain: "example.com"
 *     depth: "3"
 *     test:
 *   operator: OR
 * - className: "org.apache.stormcrawler.protocol.okhttp.HttpProtocol"
 *   filters:
 *     robots.txt:
 *   regex:
 *    - \.pdf
 *    - \.doc
 * - className: "org.apache.stormcrawler.protocol.selenium.SeleniumProtocol"
 * </pre>
 *
 * Typically, the last one in the list must not have filters as it is used as a default value. The
 * protocols are tried for matches in the order in which they are listed in the configuration. The
 * first to match gets used to fetch a URL.
 *
 * <p>A filter without a value is valid: in that case only the presence of the key is tested.
 *
 * <p>A special value <i>robots.txt</i> can be used in the filtering rules to trigger on robots.txt
 * files. This is automatically generated by the DelegatorProtocol, you don't need to add it to the
 * metadata explicitly.
 *
 * <p>A regex counts as a hit if it is found anywhere in the URL; it does not have to match the
 * entire URL. The operator logic applies to the regexes as well.
 *
 * @since 2.2
 */
public class DelegatorProtocol implements Protocol {

    private static final String DELEGATOR_CONFIG_KEY = "protocol.delegator.config";

    protected static final org.slf4j.Logger LOG = LoggerFactory.getLogger(DelegatorProtocol.class);

    private static final String ROBOTS = "robots.txt";

    static class Filter {

        String key;
        String value;

        public Filter(String k, String v) {
            key = k;
            value = v;
        }
    }

    static class FilteredProtocol {

        final Protocol protoInstance;
        final List<Filter> filters = new ArrayList<>();
        final String id;

        final List<java.util.regex.Pattern> urlPatterns = new ArrayList<>();

        enum Operator {
            AND,
            OR
        }

        // default
        Operator operator = Operator.AND;

        @NotNull
        Protocol getProtocolInstance() {
            return protoInstance;
        }

        /** Filterless implementation * */
        public FilteredProtocol(
                @Nullable String id, @NotNull String protocolImpl, @NotNull Config config) {
            this(id, protocolImpl, config, null, null, null);
        }

        public FilteredProtocol(
                @Nullable String id,
                @NotNull String protocolImpl,
                @NotNull Config config,
                @Nullable Map<String, String> filterImpls,
                @Nullable String op,
                @Nullable List<String> regexps) {

            protoInstance =
                    InitialisationUtil.initializeFromQualifiedName(protocolImpl, Protocol.class);

            protoInstance.configure(config);

            // instantiate filters
            if (filterImpls != null) {
                filterImpls.forEach((k, v) -> filters.add(new Filter(k, v)));
            }

            if (op != null) {
                this.operator = Operator.valueOf(op);
            }

            // regular expressions
            if (regexps != null) {
                regexps.forEach(s -> urlPatterns.add(Pattern.compile(s)));
            }

            this.id = id;

            // log filters found
            LOG.info(
                    "Loaded {} filters for {}; id {}; operator {}; regexp {}",
                    filters.size(),
                    protocolImpl,
                    id,
                    operator,
                    urlPatterns.size());
        }

        public ProtocolResponse getProtocolOutput(String url, Metadata metadata) throws Exception {
            return protoInstance.getProtocolOutput(url, metadata);
        }

        public BaseRobotRules getRobotRules(String url) {
            return protoInstance.getRobotRules(url);
        }

        public void cleanup() {
            protoInstance.cleanup();
        }

        boolean isMatch(final String url, final Metadata metadata) {
            // if this FP has no filters nor regexps - it can handle anything
            if (filters.isEmpty() && urlPatterns.isEmpty()) return true;

            boolean atLeastOneMatch = false;

            // check that all its filters are satisfied
            for (Filter f : filters) {
                boolean match = true;
                if (f.value == null || f.value.equals("")) {
                    // just interested in the fact that the key exists
                    if (!metadata.containsKey(f.key)) {
                        LOG.trace("Key {} not found in metadata {}", f.key, metadata);
                        match = false;
                    }
                } else {
                    // interested in the value associated with the key
                    if (!metadata.containsKeyWithValue(f.key, f.value)) {
                        LOG.trace(
                                "Key {} not found with value {} in metadata {}",
                                f.key,
                                f.value,
                                metadata);
                        match = false;
                    }
                }
                if (match) atLeastOneMatch = true;

                // optimisation
                if (operator.equals(Operator.AND) && !match) return false;
                else if (operator.equals(Operator.OR) && match) return true;
            }

            // same approach with the URLs
            for (Pattern p : urlPatterns) {
                boolean found = p.asPredicate().test(url);
                if (found) {
                    atLeastOneMatch = true;
                }
                // optimisation
                if (operator.equals(Operator.AND) && !found) return false;
                else if (operator.equals(Operator.OR) && found) return true;
            }

            // if we get to this point and the operator is AND, it means everything has
            // matched
            // but if the operator is OR we need to check that something has matched at all

            if (operator.equals(Operator.OR) && !atLeastOneMatch) return false;

            return true;
        }
    }

    private final LinkedList<FilteredProtocol> protocols = new LinkedList<>();

    @Override
    public void configure(@NotNull Config conf) {
        Object obj = conf.get(DELEGATOR_CONFIG_KEY);

        if (obj == null)
            throw new RuntimeException("DelegatorProtocol declared but no config set for it");

        // should contain a list of maps
        // each map having a className and optionally a number of filters
        if (obj instanceof Iterable) {
            // noinspection unchecked
            for (Map<String, Object> subConf : (Iterable<? extends Map<String, Object>>) obj) {
                final String className = (String) subConf.get("className");
                final Object filters = subConf.get("filters");
                final String operator = (String) subConf.get("operator");
                final String id = (String) subConf.get("id");
                final Object regexp = subConf.get("regex");

                FilteredProtocol protocol;
                if (filters == null && regexp == null) {
                    protocol = new FilteredProtocol(id, className, conf);
                } else {
                    // noinspection unchecked
                    protocol =
                            new FilteredProtocol(
                                    id,
                                    className,
                                    conf,
                                    (Map<String, String>) filters,
                                    operator,
                                    (List<String>) regexp);
                }
                protocols.add(protocol);
            }
        } else { // single value?
            throw new RuntimeException(
                    "DelegatorProtocol declared but single object found in config " + obj);
        }

        if (protocols.isEmpty()) {
            throw new RuntimeException("No sub protocols for delegation protocol defined.");
        }

        // check that the last protocol has no filter
        if (!protocols.peekLast().filters.isEmpty()) {
            throw new RuntimeException(
                    "The last sub protocol has filters but must not as it acts as the default");
        }
    }

    final FilteredProtocol getProtocolFor(String url, Metadata metadata) {

        for (FilteredProtocol p : protocols) {
            if (p.isMatch(url, metadata)) {
                return p;
            }
        }

        return null;
    }

    @Override
    public @NotNull BaseRobotRules getRobotRules(@NotNull String url) {
        final Metadata m = new Metadata();
        m.addValue(ROBOTS, "true");
        FilteredProtocol proto = getProtocolFor(url, m);
        if (proto == null) {
            throw new RuntimeException("No sub protocol eligible to retrieve robots");
        }
        return proto.getRobotRules(url);
    }

    @Override
    public @NotNull ProtocolResponse getProtocolOutput(
            @NotNull String url, @NotNull Metadata metadata) throws Exception {

        // go through the filtered protocols to find which one to use
        FilteredProtocol proto = getProtocolFor(url, metadata);
        if (proto == null) {
            throw new RuntimeException(
                    "No sub protocol eligible to retrieve " + url + "given " + metadata);
        }
        // execute and return protocol with url-meta combo
        return proto.getProtocolOutput(url, metadata);
    }

    @Override
    public void cleanup() {
        for (FilteredProtocol p : protocols) p.cleanup();
    }

    public static void main(String args[]) throws Exception {
        Protocol.main(new DelegatorProtocol(), args);
    }
}
