Retry requests that failed due to a persistent connection race instead of
replying with ERR_ZERO_SIZE_OBJECT "Bad Gateway".

The ERR_ZERO_SIZE_OBJECT errors were visible to the client when the
destination had only one address because serverDestinations.shift() made the
list of destinations empty and startConnectionOrFail() failed.

When FwdState starts to use a pinned connection, the connection is treated as
an idle persistent connection as far as race detection is concerned.
Currently, pinned connections cannot be reopened, repinned, and retried after
a pconn race. This will change when server-side bumped connections become
pinned.

It feels wrong that a failed serverConn may remain set while we are opening a
new connection to replace it. I added an XXX to mark that preexisting problem.

=== modified file 'src/forward.cc'
--- src/forward.cc	2012-02-10 03:30:02 +0000
+++ src/forward.cc	2012-02-17 00:54:18 +0000
@@ -80,40 +80,41 @@
 FwdState::abort(void* d)
 {
     FwdState* fwd = (FwdState*)d;
     Pointer tmp = fwd; // Grab a temporary pointer to keep the object alive during our scope.
 
     if (Comm::IsConnOpen(fwd->serverConnection())) {
         comm_remove_close_handler(fwd->serverConnection()->fd, fwdServerClosedWrapper, fwd);
     }
     fwd->serverDestinations.clean();
     fwd->self = NULL;
 }
 
 /**** PUBLIC INTERFACE ********************************************************/
 
 FwdState::FwdState(const Comm::ConnectionPointer &client, StoreEntry * e, HttpRequest * r)
 {
     debugs(17, 2, HERE << "Forwarding client request " << client << ", url=" << e->url() );
     entry = e;
     clientConn = client;
     request = HTTPMSGLOCK(r);
+    pconnRace = raceImpossible;
     start_t = squid_curtime;
     serverDestinations.reserve(Config.forward_max_tries);
     e->lock();
     EBIT_SET(e->flags, ENTRY_FWD_HDR_WAIT);
 }
 
 // Called once, right after object creation, when it is safe to set self
 void FwdState::start(Pointer aSelf)
 {
     // Protect ourselves from being destroyed when the only Server pointing
     // to us is gone (while we expect to talk to more Servers later).
     // Once we set self, we are responsible for clearing it when we do not
     // expect to talk to any servers.
     self = aSelf; // refcounted
 
     // We hope that either the store entry aborts or peer is selected.
     // Otherwise we are going to leak our object.
     entry->registerAbort(FwdState::abort, this);
 
@@ -308,40 +309,43 @@
         if (!err) {
             ErrorState *anErr = new ErrorState(ERR_CANNOT_FORWARD, HTTP_INTERNAL_SERVER_ERROR, request);
             anErr->xerrno = errno;
             fail(anErr);
         } // else use actual error from last connection attempt
         self = NULL; // refcounted
     }
 }
 
 void
 FwdState::fail(ErrorState * errorState)
 {
     debugs(17, 3, HERE << err_type_str[errorState->type] << " \"" << httpStatusString(errorState->httpStatus) << "\"\n\t" << entry->url() );
 
     delete err;
     err = errorState;
 
     if (!errorState->request)
         errorState->request = HTTPMSGLOCK(request);
 
+    if (pconnRace == racePossible && errorState->type == ERR_ZERO_SIZE_OBJECT)
+        pconnRace = raceHappened;
+
 #if USE_SSL
     if (errorState->type == ERR_SECURE_CONNECT_FAIL && errorState->detail)
         request->detailError(errorState->type, errorState->detail->errorNo());
     else
 #endif
         request->detailError(errorState->type, errorState->xerrno);
 }
 
 /**
  * Frees fwdState without closing FD or generating an abort
  */
 void
 FwdState::unregister(Comm::ConnectionPointer &conn)
 {
     debugs(17, 3, HERE << entry->url() );
     assert(serverConnection() == conn);
     assert(Comm::IsConnOpen(conn));
     comm_remove_close_handler(conn->fd, fwdServerClosedWrapper, this);
     serverConn = NULL;
 }
 
@@ -518,40 +522,44 @@
     default:
         return false;
     }
 
     return true;
 }
 
 void
 FwdState::serverClosed(int fd)
 {
     debugs(17, 2, HERE << "FD " << fd << " " << entry->url());
     retryOrBail();
 }
 
 void
 FwdState::retryOrBail()
 {
     if (checkRetry()) {
         debugs(17, 3, HERE << "re-forwarding (" << n_tries << " tries, " << (squid_curtime - start_t) << " secs)");
+        // we should retry the same destination if it failed due to pconn race
+        if (pconnRace == raceHappened)
+            debugs(17, 4, HERE << "retrying the same destination");
+        else
         serverDestinations.shift(); // last one failed. try another.
         startConnectionOrFail();
         return;
     }
 
     // TODO: should we call completed() here and move doneWithRetries there?
     doneWithRetries();
 
     if (self != NULL && !err && shutting_down) {
         ErrorState *anErr = new ErrorState(ERR_SHUTTING_DOWN, HTTP_SERVICE_UNAVAILABLE, request);
         errorAppendEntry(entry, anErr);
     }
 
     self = NULL; // refcounted
 }
 
 // If the Server quits before nibbling at the request body, the body sender
 // will not know (so that we can retry). Call this if we will not retry. We
 // will notify the sender so that it does not get stuck waiting for space.
 void
@@ -810,97 +818,113 @@
     int ftimeout = Config.Timeout.forward - (squid_curtime - start_t);
 
     if (ftimeout < 0)
         ftimeout = 5;
 
     if (ftimeout < ctimeout)
         ctimeout = ftimeout;
 
     if (serverDestinations[0]->getPeer() && request->flags.sslBumped == true) {
         debugs(50, 4, "fwdConnectStart: Ssl bumped connections through parrent proxy are not allowed");
         ErrorState *anErr = new ErrorState(ERR_CANNOT_FORWARD, HTTP_SERVICE_UNAVAILABLE, request);
         fail(anErr);
         self = NULL; // refcounted
         return;
     }
 
     request->flags.pinned = 0; // XXX: what if the ConnStateData set this to flag existing credentials?
     // XXX: answer: the peer selection *should* catch it and give us only the pinned peer. so we reverse the =0 step below.
     // XXX: also, logs will now lie if pinning is broken and leads to an error message.
     if (serverDestinations[0]->peerType == PINNED) {
         ConnStateData *pinned_connection = request->pinnedConnection();
-        assert(pinned_connection);
+        // pinned_connection may become nil after a pconn race
+        if (pinned_connection)
             serverConn = pinned_connection->validatePinnedConnection(request, serverDestinations[0]->getPeer());
+        else
+            serverConn = NULL;
         if (Comm::IsConnOpen(serverConn)) {
 #if 0
             if (!serverConn->getPeer())
                 serverConn->peerType = HIER_DIRECT;
 #endif
             n_tries++;
             request->flags.pinned = 1;
             if (pinned_connection->pinnedAuth())
                 request->flags.auth = 1;
+            // the server may close the pinned connection before this request
+            pconnRace = racePossible;
             dispatch();
             return;
         }
         /* Failure. Fall back on next path */
         debugs(17,2,HERE << " Pinned connection " << pinned_connection << " not valid.");
         serverDestinations.shift();
         startConnectionOrFail();
         return;
     }
 
     // Use pconn to avoid opening a new connection.
     const char *host;
     if (serverDestinations[0]->getPeer()) {
         host = serverDestinations[0]->getPeer()->host;
     } else {
         host = request->GetHost();
     }
-    Comm::ConnectionPointer temp = fwdPconnPool->pop(serverDestinations[0], host, checkRetriable());
+
+    Comm::ConnectionPointer temp;
+    // Avoid pconns after races so that the same client does not suffer twice.
+    // This does not increase the total number of connections because we just
+    // closed the connection that failed the race. And re-pinning assumes this.
+    if (pconnRace != raceHappened)
+        temp = fwdPconnPool->pop(serverDestinations[0], host, checkRetriable());
+
+    const bool openedPconn = Comm::IsConnOpen(temp);
+    pconnRace = openedPconn ? racePossible : raceImpossible;
 
     // if we found an open persistent connection to use. use it.
-    if (Comm::IsConnOpen(temp)) {
+    if (openedPconn) {
         serverConn = temp;
         debugs(17, 3, HERE << "reusing pconn " << serverConnection());
         n_tries++;
 
         if (!serverConnection()->getPeer())
             origin_tries++;
 
         comm_add_close_handler(serverConnection()->fd, fwdServerClosedWrapper, this);
 
         /* Update server side TOS and Netfilter mark on the connection. */
         if (Ip::Qos::TheConfig.isAclTosActive()) {
             temp->tos = GetTosToServer(request);
             Ip::Qos::setSockTos(temp, temp->tos);
         }
 #if SO_MARK
         if (Ip::Qos::TheConfig.isAclNfmarkActive()) {
             temp->nfmark = GetNfmarkToServer(request);
             Ip::Qos::setSockNfmark(temp, temp->nfmark);
         }
 #endif
 
         dispatch();
         return;
     }
 
+    // XXX: stale serverConn may remain set even though we are opening new conn
+
 #if URL_CHECKSUM_DEBUG
     entry->mem_obj->checkUrlChecksum();
 #endif
 
     /* Get the server side TOS and Netfilter mark to be set on the connection. */
     if (Ip::Qos::TheConfig.isAclTosActive()) {
         serverDestinations[0]->tos = GetTosToServer(request);
     }
 
 #if SO_MARK && USE_LIBCAP
     serverDestinations[0]->nfmark = GetNfmarkToServer(request);
     debugs(17, 3, "fwdConnectStart: got outgoing addr " << serverDestinations[0]->local << ", tos " << int(serverDestinations[0]->tos) << ", netfilter mark " << serverDestinations[0]->nfmark);
 #else
     serverDestinations[0]->nfmark = 0;
     debugs(17, 3, "fwdConnectStart: got outgoing addr " << serverDestinations[0]->local << ", tos " << int(serverDestinations[0]->tos));
 #endif
 
     calls.connector = commCbCall(17,3, "fwdConnectDoneWrapper", CommConnectCbPtrFun(fwdConnectDoneWrapper, this));
     Comm::ConnOpener *cs = new Comm::ConnOpener(serverDestinations[0], calls.connector, ctimeout);
     cs->setHost(host);

=== modified file 'src/forward.h'
--- src/forward.h	2011-08-20 15:57:06 +0000
+++ src/forward.h	2012-02-17 00:42:24 +0000
@@ -88,25 +88,29 @@
     time_t start_t;
     int n_tries;
     int origin_tries;
 
     // AsyncCalls which we set and may need cancelling.
     struct {
         AsyncCall::Pointer connector;  ///< a call linking us to the ConnOpener producing serverConn.
     } calls;
 
     struct {
         unsigned int connected_okay:1; ///< TCP link ever opened properly. This affects retry of POST,PUT,CONNECT,etc
         unsigned int dont_retry:1;
         unsigned int forward_completed:1;
     } flags;
 
     /** connections to open, in order, until successful */
     Comm::ConnectionList serverDestinations;
 
     Comm::ConnectionPointer serverConn; ///< a successfully opened connection to a server.
 
+    /// possible pconn race states
+    typedef enum { raceImpossible, racePossible, raceHappened } PconnRace;
+    PconnRace pconnRace; ///< current pconn race state
+
     // NP: keep this last. It plays with private/public
     CBDATA_CLASS2(FwdState);
 };
 
 #endif /* SQUID_FORWARD_H */
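
Reviewer note, not part of the patch: a minimal standalone sketch of the pconn
race bookkeeping described above. The enum values mirror the new FwdState
members; the Demo struct and the usedIdlePconn/zeroSizeReply names are
hypothetical scaffolding, and checkRetry() and destination handling are
omitted.

    #include <cassert>

    enum PconnRace { raceImpossible, racePossible, raceHappened };

    struct Demo {
        PconnRace pconnRace = raceImpossible;

        // Connection setup: skip idle pconns right after a race; otherwise a
        // reused idle (or pinned) connection makes another race possible.
        void connectStart(bool usedIdlePconn) {
            const bool reused = (pconnRace != raceHappened) && usedIdlePconn;
            pconnRace = reused ? racePossible : raceImpossible;
        }

        // Failure: a zero-size reply on a reused idle connection is treated
        // as evidence that the server closed the pconn first (a race).
        void fail(bool zeroSizeReply) {
            if (pconnRace == racePossible && zeroSizeReply)
                pconnRace = raceHappened;
        }

        // Retry: after a race, keep the same destination instead of
        // shifting to the next one.
        bool retrySameDestination() const { return pconnRace == raceHappened; }
    };

    int main() {
        Demo fwd;
        fwd.connectStart(true);             // reused an idle persistent connection
        fwd.fail(true);                     // server closed it first: ERR_ZERO_SIZE_OBJECT
        assert(fwd.retrySameDestination()); // retry the same address, but...
        fwd.connectStart(true);             // ...on a fresh connection, not a pconn
        assert(fwd.pconnRace == raceImpossible);
        return 0;
    }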