package com.k_int.ia.harvest.web;

import com.k_int.ia.content_analysis.ContentAnalysisException;
import com.k_int.ia.content_analysis.ResourceInformation;
import com.k_int.ia.content_analysis.StreamAnalyser;
import com.k_int.ia.harvest.HarvestJob;
import com.k_int.ia.harvest.HarvestService;
import com.k_int.ia.harvest.util.Md5ChecksumBuilder;
import com.k_int.ia.metadata_submission.MetadataSubmissionService;
import com.k_int.ia.metadata_submission.SubmissionException;
import com.k_int.ia.util.DataHelper;
import java.io.IOException;
import java.net.ConnectException;
import java.net.MalformedURLException;
import java.net.URL;
import java.sql.SQLException;
import java.sql.Timestamp;
import java.text.ParsePosition;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Date;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import net.sf.hibernate.HibernateException;
import net.sf.hibernate.Session;
import net.sf.hibernate.SessionFactory;
import org.apache.axis.transport.http.HTTPConstants;
import org.apache.commons.httpclient.Header;
import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.HttpMethod;
import org.apache.commons.httpclient.HttpRecoverableException;
import org.apache.commons.httpclient.SimpleHttpConnectionManager;
import org.apache.commons.httpclient.methods.GetMethod;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.regexp.RE;
import org.apache.regexp.RESyntaxException;
import org.apache.xerces.impl.xs.SchemaSymbols;
import org.mozilla.javascript.Context;
import org.mozilla.javascript.Function;
import org.mozilla.javascript.Scriptable;
import org.springframework.context.ApplicationContext;

/* loaded from: input_file:WEB-INF/lib/inode_data_layer-1.1.1.jar:com/k_int/ia/harvest/web/WebHarvestJob.class */
/**
 * Harvest job that crawls a web site starting from the base URL of a stored
 * harvest instruction.
 *
 * <p>URLs are taken from a FIFO work queue. Each one is fetched with an HTTP
 * GET; responses whose content type is not supported by the
 * {@link StreamAnalyser}, or whose body MD5 has already been seen during this
 * run, are skipped. Resources considered modified since the stored state are
 * passed to the task script's {@code preIndex} function and, if accepted,
 * submitted to the {@link MetadataSubmissionService}, after which
 * {@code postIndex} is called. Links found in a resource are queued while the
 * remaining depth is positive and the include/exclude patterns of the root
 * task allow them.
 *
 * <p>When the queue is drained the task is rescheduled using its interval.
 */
public class WebHarvestJob extends HarvestJob {
    protected static Log log = LogFactory.getLog(WebHarvestJob.class);

    /** Socket and connection timeout applied to every HTTP request, in milliseconds. */
    private static final int HTTP_TIMEOUT_MS = 60000;
    /** Pause inserted before a response body is read, in milliseconds. */
    private static final long REQUEST_DELAY_MS = 1000L;
    /** Date layout used to read the {@code Last-Modified} header. */
    private static final String HTTP_DATE_PATTERN = "EEE, d MMM yyyy HH:mm:ss z";

    private ApplicationContext ctx;
    private HarvestService harvest_svc;
    private StreamAnalyser sa;
    private MetadataSubmissionService metadata_submission_service;
    private HarvestTaskInfo root_task;
    private Long task_id;
    private WebSynchronizationTaskHDO task = null;
    private int records_processed = 0;
    /** URLs waiting to be fetched, in discovery order. */
    private LinkedList<HarvestTaskInfo> work_queue = new LinkedList<HarvestTaskInfo>();
    /** Final URIs (as reported by the HTTP method) of every response that was checksummed. */
    private Set<String> processed = new HashSet<String>();
    /** URLs currently sitting in {@link #work_queue}. */
    private Set<String> pending = new HashSet<String>();
    /** Body checksums already seen during this run. */
    private Set<String> md5 = new HashSet<String>();
    private Map associated_items = new HashMap();
    /** Regular expressions compiled so far, keyed by their source pattern. */
    private Map<String, RE> compiled_patterns = new HashMap<String, RE>();
    private boolean running = false;
    private Session session = null;
    private Context js_context = null;
    private Function pre_index = null;
    private Function post_index = null;
    private Scriptable scope = null;

    /**
     * Creates a job for the harvest instruction with the given id.
     *
     * @param l                  id of the harvest instruction to run
     * @param applicationContext context used to look up the {@code SubmissionService},
     *                           {@code ResourceSummariser} and, when the job runs,
     *                           {@code INodeSessionFactory} beans
     * @param harvestService     the owning harvest service
     */
    public WebHarvestJob(Long l, ApplicationContext applicationContext, HarvestService harvestService) {
        this.task_id = l;
        this.ctx = applicationContext;
        this.harvest_svc = harvestService;
        this.metadata_submission_service = (MetadataSubmissionService) applicationContext.getBean("SubmissionService");
        this.sa = (StreamAnalyser) applicationContext.getBean("ResourceSummariser");
    }

    /**
     * Runs the crawl: opens a session, loads the task, initialises the task
     * script, drains the work queue and finally reschedules the task.
     *
     * <p>Any exception aborts the job and is logged. The script context and
     * the session are always released.
     */
    @Override // java.lang.Runnable
    public void run() {
        try {
            this.session = ((SessionFactory) this.ctx.getBean("INodeSessionFactory")).openSession();
            this.task = DataHelper.lookupHarvestInstructionById(this.session, this.task_id);
            log.debug("WebHarvestJob::WebHarvestJob - guid=" + this.task.getQueuedBy());
            log.debug("Running Web Harvest Job - " + this.task.getBaseUrl());
            initTaskScript(this.task.getTaskScript());
            this.root_task = new HarvestTaskInfo(new URL(this.task.getBaseUrl()), this.task.getDefaultDepth().longValue(), null, this.task.getIncludePatterns(), this.task.getExcludePatterns(), this.task.getIndexPatterns(), this.task.getAdditionalMetadata(), this.task.getDefaultTermAuthority(), null, null);
            addURL(this.root_task);
            this.running = true;
            while (this.running && this.work_queue.size() > 0) {
                HarvestTaskInfo next;
                synchronized (this.work_queue) {
                    next = this.work_queue.removeFirst();
                }
                log.debug("Processing next item (queue size=" + this.work_queue.size() + "): " + next + "parent=" + next.parent);
                processNext(next);
            }
            rescheduleTask();
        } catch (Exception e) {
            // The previous message ("Problem closing session") was emitted for every
            // failure of the job, not only for close failures.
            log.warn("Web harvest job failed for task " + this.task_id, e);
        } finally {
            exitTaskScript();
            closeSession();
        }
    }

    /**
     * Enters a script context, evaluates the task script and resolves its
     * {@code preIndex} and {@code postIndex} functions.
     *
     * @param taskScript source of the task script
     * @throws Exception if the script cannot be evaluated or lacks a required function
     */
    private void initTaskScript(String taskScript) throws Exception {
        log.debug("Configuring task_script");
        this.js_context = Context.enter();
        this.scope = this.js_context.initStandardObjects();
        this.js_context.evaluateString(this.scope, taskScript, "IndexTaskScript", 1, null);
        this.pre_index = lookupScriptFunction("preIndex");
        this.post_index = lookupScriptFunction("postIndex");
        log.debug("Completed init of task script");
    }

    /**
     * Fetches a function defined by the task script.
     *
     * @param name name of the script function
     * @return the function object
     * @throws IllegalStateException if the script defines no function of that name
     */
    private Function lookupScriptFunction(String name) {
        Object candidate = this.scope.get(name, this.scope);
        if (!(candidate instanceof Function)) {
            throw new IllegalStateException("Task script does not define a function named '" + name + "'");
        }
        return (Function) candidate;
    }

    /** Leaves the script context entered by {@link #initTaskScript(String)}, if any. */
    private void exitTaskScript() {
        if (this.js_context != null) {
            Context.exit();
            this.js_context = null;
        }
    }

    /**
     * Clears the task's queue marker, sets its next due date from its interval
     * and commits the change.
     */
    private void rescheduleTask() throws HibernateException, SQLException {
        log.debug("Closing session - Interval is " + this.task.getInterval());
        Date nextDue = new Date(System.currentTimeMillis() + this.task.getInterval().longValue());
        log.debug("next run scheduled for " + nextDue);
        this.task.setQueuedBy(null);
        this.task.setNextDue(nextDue);
        this.session.update(this.task);
        this.session.flush();
        this.session.connection().commit();
    }

    /** Closes the session once, logging (rather than hiding) a failure to do so. */
    private void closeSession() {
        if (this.session == null) {
            return;
        }
        try {
            this.session.close();
        } catch (Exception e) {
            log.warn("Problem closing session", e);
        } finally {
            this.session = null;
        }
    }

    /**
     * Fetches one queued URL and processes the response.
     *
     * <p>Failures are logged, recorded on the task info and flagged on the
     * root task (status 1); they never propagate, so the crawl continues with
     * the next queued URL. The HTTP connection is released on every path.
     *
     * @param harvestTaskInfo the queued item to process
     */
    private void processNext(HarvestTaskInfo harvestTaskInfo) {
        this.pending.remove(harvestTaskInfo.url.toString());
        log.debug("processing " + harvestTaskInfo.url + " remaining depth=" + harvestTaskInfo.depth);
        URL url = harvestTaskInfo.url;
        GetMethod getMethod = null;
        try {
            HttpClient httpClient = new HttpClient(new SimpleHttpConnectionManager());
            getMethod = new GetMethod(url.toString());
            httpClient.setTimeout(HTTP_TIMEOUT_MS);
            httpClient.setConnectionTimeout(HTTP_TIMEOUT_MS);
            getMethod.setFollowRedirects(true);
            getMethod.setStrictMode(false);
            getMethod.setRequestHeader(HTTPConstants.HEADER_USER_AGENT, "OpenHarvestRobot/1.1");
            getMethod.setRequestHeader("Pragma", HTTPConstants.HEADER_CACHE_CONTROL_NOCACHE);
            httpClient.executeMethod(getMethod);
            if (getMethod.getStatusCode() == 200) {
                handleOkResponse(harvestTaskInfo, url, getMethod);
            } else {
                log.debug("HTTP Get not ok");
                this.root_task.status = 1;
                harvestTaskInfo.logMessage("HTTP Get Failed - bad status code:" + getMethod.getStatusCode() + " while processing " + harvestTaskInfo + " from " + harvestTaskInfo.parent);
            }
        } catch (HttpRecoverableException e) {
            log.warn("Exception while examining " + harvestTaskInfo.url, e);
            harvestTaskInfo.logMessage("Cannot be processed httpclient exception while processing " + harvestTaskInfo.url);
            this.root_task.status = 1;
        } catch (ConnectException e) {
            // Must precede IOException, otherwise this handler can never run.
            this.root_task.status = 1;
            log.warn("Exception while examining " + harvestTaskInfo.url, e);
            harvestTaskInfo.logMessage("Cannot be processed beause the timeout was exceeded while processing " + harvestTaskInfo.url);
        } catch (MalformedURLException e) {
            log.warn("Exception while examining " + harvestTaskInfo.url, e);
            harvestTaskInfo.logMessage("Cannot be processed because the URL is malformed: " + harvestTaskInfo.url);
            this.root_task.status = 1;
        } catch (ContentAnalysisException e) {
            log.warn("Exception while examining " + harvestTaskInfo + " child url of " + harvestTaskInfo.parent, e);
            harvestTaskInfo.logMessage("Problem :" + e.toString());
            this.root_task.status = 1;
        } catch (IOException e) {
            log.warn("Exception while examining " + harvestTaskInfo.url, e);
            harvestTaskInfo.logMessage("Cannot be processed because there was an IO exception while processing " + harvestTaskInfo.url);
            this.root_task.status = 1;
        } catch (Exception e) {
            log.warn("Exception while processing " + harvestTaskInfo.url, e);
            harvestTaskInfo.logMessage("Unhandled exception : " + e.toString());
            this.root_task.status = 1;
        } finally {
            if (getMethod != null) {
                getMethod.releaseConnection();
            }
        }
    }

    /**
     * Processes a 200 response: checksums the body, decides whether the
     * resource changed, indexes it if so, queues its links and stores the new
     * resource state.
     *
     * @param harvestTaskInfo the item being processed
     * @param url             the URL that was requested
     * @param getMethod       the executed HTTP method
     * @throws Exception on any HTTP, analysis, script or persistence failure
     */
    private void handleOkResponse(HarvestTaskInfo harvestTaskInfo, URL url, GetMethod getMethod) throws Exception {
        Header contentTypeHeader = getMethod.getResponseHeader(HTTPConstants.HEADER_CONTENT_TYPE);
        String contentType = contentTypeHeader != null ? mediaTypeOf(contentTypeHeader.getValue()) : null;
        if (contentType == null || !this.sa.isSupported(contentType)) {
            return;
        }
        Header lastModifiedHeader = getMethod.getResponseHeader("last-modified");
        Header languageHeader = getMethod.getResponseHeader("Content-Language");

        pauseBeforeDownload();

        String checksum = Md5ChecksumBuilder.getChecksumString(getMethod.getResponseBodyAsStream());
        log.debug("Checksum String = " + checksum);
        // Set.add returns false when the checksum was already recorded.
        boolean duplicateContent = !this.md5.add(checksum);
        this.processed.add(getMethod.getURI().toString());
        if (duplicateContent) {
            log.debug("Not processing - Already visited a this URL or a resource with this MD5 for " + url);
            return;
        }

        long lastModified = resolveLastModified(lastModifiedHeader);
        String language = languageHeader != null ? languageHeader.getValue() : "";
        if (lastModified > 0) {
            harvestTaskInfo.logMessage("Last Modified: " + new Date(lastModified));
        }
        if (language != null) {
            harvestTaskInfo.logMessage("Resource language: " + language);
        }

        ResourceData state = lookupResourceData(this.root_task, url.toString());
        if (state == null) {
            state = new ResourceData();
        }
        log.debug("Processing: " + url.toString());
        log.debug("last modified from records is : " + state.getLastModified() + " from web=" + new Timestamp(lastModified));
        log.debug("checksum from records is : " + state.getChecksum() + " from web=" + checksum);

        // New resources are always modified; known ones only when both the
        // timestamp is newer and the checksum differs.
        boolean modified = false;
        if (state.getLastModified() == null || (state.getLastModified().getTime() < lastModified && !checksum.equals(state.getChecksum()))) {
            harvestTaskInfo.logMessage("Last Modified from state record is: " + state.getLastModified());
            log.debug("Mark as modified");
            modified = true;
        }

        log.debug("Processing content type " + contentType + ", url=" + url);
        // Analysis is needed even for unmodified resources because it yields the links to follow.
        ResourceInformation analysis = this.sa.analyse(url, harvestTaskInfo.additional_metadata_fragments, harvestTaskInfo.default_term_authority);

        if (modified) {
            state.setLastModified(new Timestamp(lastModified));
            state.setChecksum(checksum);
            harvestTaskInfo.logMessage("Submitted for indexing - set last modified to " + state.getLastModified() + " chk=" + state.getChecksum());
            log.debug("** Submitted for indexing - set last modified to " + state.getLastModified() + " chk=" + state.getChecksum() + " **");
            submitForIndexing(harvestTaskInfo, url, analysis, checksum);
        } else {
            harvestTaskInfo.logMessage("Not modified since last examined");
        }

        if (harvestTaskInfo.depth > 0) {
            queueLinks(harvestTaskInfo, analysis);
        } else {
            log.debug("Max depth reached");
        }

        if (modified) {
            log.debug("Resource marked as modified... process");
            updateResourceData(this.root_task, url.toString(), state);
        }
    }

    /**
     * Extracts the media type from a {@code Content-Type} header value by
     * dropping any parameters after the first {@code ;} and trimming.
     *
     * @param contentType raw header value, may be {@code null}
     * @return the bare media type, or {@code null} if the input is {@code null}
     */
    private static String mediaTypeOf(String contentType) {
        if (contentType == null) {
            return null;
        }
        int semicolon = contentType.indexOf(';');
        String bare = semicolon >= 0 ? contentType.substring(0, semicolon) : contentType;
        return bare.trim();
    }

    /** Sleeps for {@link #REQUEST_DELAY_MS} before the response body is read. */
    private void pauseBeforeDownload() {
        try {
            Thread.sleep(REQUEST_DELAY_MS);
        } catch (InterruptedException ignored) {
            // The delay is best effort; as before, an interrupt only shortens it
            // and the crawl carries on.
        }
    }

    /**
     * Determines the modification time to use for a response.
     *
     * @param lastModifiedHeader the {@code Last-Modified} header, may be {@code null}
     * @return the parsed header time in milliseconds, or the current time when
     *         the header is missing or cannot be parsed
     */
    private long resolveLastModified(Header lastModifiedHeader) {
        long now = System.currentTimeMillis();
        if (lastModifiedHeader == null) {
            return now;
        }
        String raw = lastModifiedHeader.getValue();
        log.debug("parsing : " + raw);
        Date parsed = null;
        if (raw != null) {
            // HTTP dates use English day and month names regardless of the JVM locale.
            SimpleDateFormat httpDate = new SimpleDateFormat(HTTP_DATE_PATTERN, Locale.US);
            httpDate.setLenient(true);
            parsed = httpDate.parse(raw, new ParsePosition(0));
        }
        if (parsed == null) {
            // Falling back to 0 would make a known resource look permanently unmodified.
            log.debug("Unparseable last-modified header, using current time: " + raw);
            return now;
        }
        return parsed.getTime();
    }

    /**
     * Submits a modified resource for indexing unless it matches an index
     * pattern or the script's {@code preIndex} function rejects it. A failed
     * submission is logged and does not abort processing of the resource.
     */
    private void submitForIndexing(HarvestTaskInfo harvestTaskInfo, URL url, ResourceInformation analysis, String checksum) throws Exception {
        boolean indexPage = false;
        if (this.root_task.index_patterns != null && this.root_task.index_patterns.size() > 0) {
            indexPage = checkRegExp(this.root_task.index_patterns.iterator(), url.toExternalForm());
        }
        if (indexPage) {
            log.debug(harvestTaskInfo.url + " is an index page - no data added");
            return;
        }
        Object[] scriptArgs = {harvestTaskInfo, analysis, this.ctx};
        boolean accepted = Context.toBoolean(this.pre_index.call(this.js_context, this.scope, this.scope, scriptArgs));
        log.debug("Result of pre_index = " + accepted);
        if (!accepted) {
            log.debug("Resource " + harvestTaskInfo.url + " failed preIndex condition");
            return;
        }
        try {
            this.metadata_submission_service.submit(url.toString(), (String) null, (String) null, "Open Harvest Web Robot : " + url.toString(), "OpenHarvestWebRobot", (String) null, this.task.getRecordPublicationCollectionId(), analysis.getRDF(), checksum);
            this.post_index.call(this.js_context, this.scope, this.scope, scriptArgs);
        } catch (SubmissionException e) {
            log.warn("Submission failed for " + url, e);
        }
    }

    /**
     * Queues every link of the analysed resource that has not been processed
     * yet and is allowed by the root task's patterns, one level deeper.
     */
    private void queueLinks(HarvestTaskInfo harvestTaskInfo, ResourceInformation analysis) throws Exception {
        for (URL link : analysis.getLinkage()) {
            if (this.processed.contains(link.toString()) || !isAllowed(link)) {
                continue;
            }
            HarvestTaskInfo child = new HarvestTaskInfo(link, harvestTaskInfo.depth - 1, harvestTaskInfo);
            harvestTaskInfo.addChild(child);
            addURL(child);
        }
    }

    /**
     * Applies the root task's exclude and include patterns to a link.
     *
     * @return {@code false} if an exclude pattern matches, or if include
     *         patterns exist and none of them matches; {@code true} otherwise
     */
    private boolean isAllowed(URL link) throws RESyntaxException {
        String external = link.toExternalForm();
        if (this.root_task.exclude_patterns != null && this.root_task.exclude_patterns.size() > 0
                && checkRegExp(this.root_task.exclude_patterns.iterator(), external)) {
            return false;
        }
        if (this.root_task.include_patterns != null && this.root_task.include_patterns.size() > 0
                && !checkRegExp(this.root_task.include_patterns.iterator(), external)) {
            return false;
        }
        return true;
    }

    /**
     * Queues a URL at the given depth unless it is already waiting in the queue.
     *
     * @param url             URL to fetch
     * @param i               remaining depth for the new item
     * @param harvestTaskInfo parent item that linked to the URL
     */
    public void addURL(URL url, int i, HarvestTaskInfo harvestTaskInfo) {
        // Set.add returns false when the URL is already pending.
        if (!this.pending.add(url.toString())) {
            return;
        }
        this.work_queue.addLast(new HarvestTaskInfo(url, i, harvestTaskInfo));
    }

    /**
     * Queues an item unless its URL is already waiting in the queue.
     *
     * @param harvestTaskInfo item to queue
     */
    public void addURL(HarvestTaskInfo harvestTaskInfo) {
        if (!this.pending.add(harvestTaskInfo.url.toString())) {
            return;
        }
        this.work_queue.addLast(harvestTaskInfo);
    }

    /**
     * Tests a string against a sequence of regular expressions.
     *
     * @param it  iterator over pattern strings
     * @param str text to test
     * @return {@code true} as soon as one pattern matches, {@code false} if none does
     * @throws RESyntaxException if a pattern cannot be compiled
     */
    private boolean checkRegExp(Iterator it, String str) throws RESyntaxException {
        while (it.hasNext()) {
            String pattern = (String) it.next();
            // Each pattern is compiled once per job instead of once per tested URL.
            RE compiled = this.compiled_patterns.get(pattern);
            if (compiled == null) {
                compiled = new RE(pattern);
                this.compiled_patterns.put(pattern, compiled);
            }
            if (compiled.match(str)) {
                return true;
            }
        }
        return false;
    }

    /**
     * Stores the state of a resource on the task and commits it.
     *
     * @param harvestTaskInfo unused
     * @param str             resource URL used as the key
     * @param resourceData    state to store
     */
    private void updateResourceData(HarvestTaskInfo harvestTaskInfo, String str, ResourceData resourceData) throws HibernateException, SQLException {
        this.task.getResources().remove(str);
        this.task.getResources().put(str, resourceData);
        this.session.update(this.task);
        this.session.flush();
        this.session.connection().commit();
    }

    /**
     * Looks up the stored state of a resource.
     *
     * @param harvestTaskInfo unused
     * @param str             resource URL used as the key
     * @return the stored state, or {@code null} if the task has none for this URL
     */
    private ResourceData lookupResourceData(HarvestTaskInfo harvestTaskInfo, String str) {
        return (ResourceData) this.task.getResources().get(str);
    }
}
